@article{3d:loh:micro2007,
   Author  = {Loh, Gabriel H. and Xie, Yuan and Black, Bryan},
   Title   = {Processor Design in {3D} Die-Stacking Technologies},
   Journal = {IEEE Micro},
   Year    = {2007},
}

@book{ICbook,
   Author    = {Rabaey, Jan and Chandrakasan, Anantha and Nikolic, Borivoje},
   Title     = {Digital Integrated Circuits},
   Publisher = {Prentice-Hall},
   Year      = {2003},
}

@inproceedings{3d:cost,
   Author    = {Smith, Larry and Smith, Greg and Hosali, Sharath and Arkalgud, Sitaram},
   Title     = {{3D}: It All Comes Down to Cost},
   BookTitle = {Proceedings of RTI Conference of 3D Architecture for Semiconductors and Packaging},
   Year      = {2007},
}

@inproceedings{3d:BMN+2006,
   Author    = {Black, Bryan and others},
   Title     = {Die Stacking ({3D}) Microarchitecture},
   BookTitle = {IEEE/ACM International Symposium on Microarchitecture (MICRO)},
   Year      = {2006},
}

@article{3d-AFG+05,
   Author   = {Ababei, C. and Feng, Y. and Goplen, B. and Mogal, H. and Zhang, T. and Bazargan, K. and Sapatnekar, S. S.},
   Title    = {Placement and Routing in {3D} Integrated Circuits},
   Journal  = {IEEE Design and Test of Computers},
   Volume   = {22},
   Number   = {6},
   Pages    = {520--531},
   Abstract = {Three-dimension technologies offer great promise in providing improvements in the overall circuit performance. Physical design plays a major role in the ability to exploit the flexibilities offered in the third dimension, and this article gives an overview of placement and routing methods for FPGA- and ASIC-style designs. We describe CAD techniques for placement and routing in 3D ICs, developed under our 3D analysis and design optimization framework. These approaches address a dichotomy of design styles, both FPGA and ASIC. The factors that are important in each style are different, so that a one-size-fits-all approach is impractical, and therefore, we present separate approaches for 3D physical design for each of these technologies. Hence, our FPGA placement method uses a two-step optimization process that minimizes inter-tier vias first, followed by further optimization within and across tiers. In contrast, the ASIC flow uses cost function weighting to discourage, but not minimize, inter-tier crossings.},
   Year     = {2005},
}


@inproceedings{3d-AB04,
   Author    = {Ababei, C. and Bazargan, K.},
   Title     = {Exploring Potential Benefits of {3D} {FPGA} Integration},
   BookTitle = {Field-Programmable Logic and its Applications},
   Abstract  = {A new timing-driven partitioning-based placement tool for 3D
FPGA integration is presented. The circuit is first divided into
layers with limited number of inter-layer vias, and then placement is
performed on individual layers, while minimizing the delay of
critical paths. We use our tool, which will be available on the web
for the research community, as a platform for exploring potential
benefits in terms of delay and wire-length that 3D technologies have
to offer for FPGA fabrics. We show that 3D integration results in
wire-length reduction for FPGA designs. However, unlike the ASIC
case, wire-length reduction does not automatically translate to much
smaller circuit delays, unless multi-segment lengths are employed
between layers. Our empirical analysis shows that wire-length can
be reduced by up to 50\% (20\% on average) using 5 layers. Delay
reductions are estimated to be up to 30\% (15\% on average) using
the same number of layers.},
   Year      = {2004},
}



@inproceedings{3d-AMB05,
   Author    = {Ababei, C. and others},
   Title     = {Three-dimensional Place and Route for {FPGAs}},
   BookTitle = {Asia South-Pacific Design Automation Conference},
   Pages     = {773--778},
   Year      = {2005},
}



@article{3D:AMB06,
   Author   = {Ababei, C. and others},
   Title    = {Three-Dimensional Place and Route for {FPGAs}},
   Journal  = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
   Volume   = {25},
   Number   = {6},
   Pages    = {1132--1140},
   ISSN     = {0278-0070},
   Abstract = {We present timing-driven partitioning and simulated-annealing (SA)-based placement algorithms together with a detailed routing tool for three-dimensional (3-D) field-programmable gate array (FPGA) integration. The circuit is first divided into layers with a limited number of interlayer vias, and then placed on individual layers, while minimizing the delay of critical paths. We use our tool as a platform to explore the potential benefits, in terms of delay and wire length (WL), that 3-D technologies can offer for FPGA fabrics. Experimental results show, on average, a total decrease of 25\% in WL and 35\% in delay can be achieved over traditional two-dimensional chips, when ten layers are used in 3-D integration.},
   Keywords = {Field-programmable gate arrays (FPGAs), routing, three-dimensional (3-D) circuits, timing-driven placement},
   Year     = {2006},
}



@inproceedings{3D:AHK+00,
   Author    = {Agarwal, V. and Hrishikesh, M. S. and Keckler, S. W. and Burger, D.},
   Title     = {Clock rate versus {IPC}: the end of the road for conventional microarchitectures},
   BookTitle = {Proceedings of the 27th International Symposium on Computer Architecture},
   Pages     = {248--259},
   Abstract  = {The doubling of microprocessor performance every three years has been the result of two factors: more transistors per chip and superlinear scaling of the processor clock with technology generation. Our results show that, due to both diminishing improvements in clock rates and poor wire scaling as semiconductor devices shrink, the achievable performance growth of conventional microarchitectures will slow substantially. In this paper, we describe technology-driven models for wire capacitance, wire delay, and microarchitectural component delay. Using the results of these models, we measure the simulated performance---estimating both clock rate and IPC---of an aggressive out-of-order microarchitecture as it is scaled from a 250 nm technology to a 35 nm technology. We perform this analysis for three clock scaling targets and two microarchitecture scaling strategies: pipeline scaling and capacity scaling. We find that no scaling strategy permits annual performance improvements of better than 12.5\%, which is far worse than the annual 50--60\% to which we have grown accustomed.},
   Keywords  = {delays, microprocessor chips, performance evaluation, IPC, capacity scaling, clock rate, doubling microprocessor performance, microarchitectures, performance growth, pipeline scaling, technology-driven models, wire capacitance, wire delay},
   Year      = {2000},
}



@inproceedings{3D:AGT+04,
   Author    = {Alam, S. M. and Gan, Chee Lip and Thompson, C. V. and Troxel, D. E.},
   Title     = {Circuit level reliability analysis of {Cu} interconnects},
   BookTitle = {Proceedings of the 5th International Symposium on Quality Electronic Design},
   Pages     = {238--243},
   Abstract  = {Copper (Cu) based interconnect technology is expected to meet some of the challenges of technology scaling in the pursuit of higher performance. However, Cu interconnects are still susceptible to electromigration-induced failure over time. We describe a new hierarchical approach for predicting the reliability of Cu-based interconnects in circuit layouts, and present an RCAD tool, SysRel, for such an analysis. We propose a (jL) product filtering algorithm with a classification of separate via-above and via-below treatments in Cu interconnect trees. After the filtering of immortal trees, a default model is applied to the remaining trees to compute reliability figures for individual units. SysRel utilizes joint stochastic reliability metrics based on the desired lifetime of a chip and combines reliability figures from individual fundamental reliability units. Simulation results with a 32-bit comparator circuit layout demonstrate the significance of our methodology in selectively identifying critical nets and their impacts on overall reliability.},
   Keywords  = {copper, current density, electromigration, electronic design automation, integrated circuit interconnections, integrated circuit metallisation, integrated circuit reliability, Cu, RCAD tool, circuit level reliability analysis, comparator circuit layout, copper based interconnect, critical nets, default model, desired lifetime, electromigration-induced failure, hierarchical approach, interconnect trees, joint stochastic reliability metrics, product filtering algorithm},
   Year      = {2004},
}



@inproceedings{3D:ATT02,
   Author    = {Alam, S. M. and Troxel, D. E. and Thompson, C. V.},
   Title     = {A comprehensive layout methodology and layout-specific circuit analyses for three-dimensional integrated circuits},
   BookTitle = {Proceedings of the International Symposium on Quality Electronic Design},
   Pages     = {246--251},
   Abstract  = {In this paper, we describe a comprehensive layout methodology for bonded three-dimensional integrated circuits (3D ICs). In bonded 3D integration technology, parts of a circuit are fabricated on different wafers, and then, the wafers are bonded with a glue layer of Cu or polymer based adhesive. Using our layout methodology, designers can layout such 3D circuits with necessary information on inter-wafer via/contact and orientation of each wafer embedded in the layout. We have implemented the layout methodology in 3DMagic. Availability of 3DMagic has led to interesting research with a wide range of layout-specific circuit analyses, from performance comparison of 2D and 3D circuits to layout-specific reliability analyses in 3D circuits. Using 3DMagic, researchers have designed and simulated an 8-bit encryption processor mapped into 2D and 3D FPGA layouts. Moreover, the layout methodology is an essential element of our ongoing research for the framework of a novel reliability computer aided design tool, ERNI-3D.},
   Keywords  = {adhesives, circuit layout CAD, circuit simulation, cryptography, field programmable gate arrays, integrated circuit interconnections, integrated circuit layout, integrated circuit reliability, microprocessor chips, wafer bonding, 3D circuit layout, 3D integrated circuits, 3DMagic layout method implementation, 8 bit, Cu, Cu glue layer, ERNI-3D reliability computer aided design tool, FPGA layouts, bonded 3D ICs, bonded 3D integration technology, encryption processor, inter-wafer via/contact, layout methodology, layout-specific circuit analysis, layout-specific reliability analysis, performance comparison, polymer based adhesive, wafer orientation},
   Year      = {2002},
}



@inproceedings{3D:AK97,
   Author    = {Albonesi, D. H. and Koren, I.},
   Title     = {Improving the memory bandwidth of highly-integrated, wide-issue, microprocessor-based systems},
   BookTitle = {Proceedings of the 1997 International Conference on Parallel Architectures and Compilation Techniques},
   Pages     = {126--135},
   Abstract  = {Next-generation wide-issue processors will require greater memory bandwidth than provided by present memory hierarchy designs. We propose techniques for increasing the memory bandwidth of multi-ported L1 D-caches, large on-chip L2 caches and dedicated memory ports while minimizing the cycle time impact. These approaches are evaluated within the context of an 8-way superscalar processor design and next-generation VLSI, packaging and RAM technologies. We show that the combined L1 and L2 cache enhancements can outperform conventional techniques by over 80\%, and that even with an on-chip 512-kByte L2 cache, board-level caches provide significant enough performance gains to justify their higher cost},
   Keywords  = {VLSI, cache storage, integrated circuit packaging, microcomputers, microprocessor chips, performance evaluation, random-access storage, 512 kByte, 8-way superscalar processor design, board-level caches, cost, cycle time impact minimization, dedicated memory ports, highly-integrated wide-issue microprocessor-based systems, memory bandwidth, memory hierarchy designs, multi-ported L1 D-caches, next-generation RAM technology, next-generation VLSI technology, next-generation packaging technology, on-chip L2 caches, performance gains},
   Year      = {1997},
}



@inproceedings{3D:ABC93,
   Author    = {Alexander, M. A. and Bailey, M. W. and Childers, B. R. and Davidson, J. W. and Jinturkar, S.},
   Title     = {Memory bandwidth optimizations for wide-bus machines},
   BookTitle = {Proceedings of the Twenty-Sixth Hawaii International Conference on System Sciences},
   Volume    = {1},
   Pages     = {466--475},
   Abstract  = {The authors describe and evaluate the effectiveness of some code improvement techniques that are designed to take advantage of wide-bus machines (WBMs): that is, a microprocessor with a memory bus width at least twice the size of the integer data type handled by the processor and assumed by the programmer. They discuss some compiler optimizations that take advantage of the increased bandwidth available from a wide bus. The investigations show that WBMs can expect reduction in memory bus cycles on the order of 5 to 15\%. Using new code improvement algorithms designed to exploit the availability of a wide bus, the studies show that, for many memory-insensitive algorithms, it is possible to reduce the number of memory loads and stores by 30 to 40\%.},
   Keywords  = {computer architecture, microcomputers, WBMs, code improvement, code improvement algorithms, memory bandwidth optimisation, memory bus cycles, memory bus width, memory-insensitive algorithms, microprocessor, wide-bus machines},
   Year      = {1993},
}



@inproceedings{3D:ACC+95,
   Author    = {Alexander, M. J. and Cohoon, J. P. and Colflesh, J. L. and Karro, J. and Robins, G.},
   Title     = {Three-dimensional field-programmable gate arrays},
   BookTitle = {Proceedings of the Eighth Annual IEEE International ASIC Conference and Exhibit},
   Pages     = {253--256},
   Abstract  = {Motivated by improving FPGA performance, we propose a new three-dimensional (3D) FPGA architecture, along with a fabrication methodology. We analyze the expected manufacturing yield, and raise several physical-design issues in the new 3D paradigm. Our techniques also have good implications for resource utilization, physical size, and power consumption},
   Keywords  = {VLSI, application specific integrated circuits, field programmable gate arrays, integrated circuit design, integrated circuit yield, logic CAD, logic partitioning, network routing, 3D field-programmable gate arrays, 3D paradigm, FPGA performance, fabrication methodology, manufacturing yield, physical size, physical-design issues, power consumption, resource utilization},
   Year      = {1995},
}



@inproceedings{3D:BWF+07,
   Author    = {Vaidyanathan, B. and Hung, W. and Wang, F. and Xie, Y. and Narayanan, V. and Irwin, M. J.},
   Title     = {Architecting Microprocessor Components in {3D} Design Space},
   BookTitle = {VLSI Design},
   Pages     = {103--108},
   Year      = {2007},
}



@article{3D:BSK+01,
   Author   = {Banerjee, K. and Souri, S. J. and Kapur, P. and Saraswat, K. C.},
   Title    = {{3-D} {ICs}: a novel chip design for improving deep-submicrometer interconnect performance and systems-on-chip integration},
   Journal  = {Proceedings of the IEEE},
   Volume   = {89},
   Number   = {5},
   Pages    = {602--633},
   ISSN     = {0018-9219},
   Abstract = {Performance of deep-submicrometer very large scale integrated (VLSI) circuits is being increasingly dominated by the interconnects due to decreasing wire pitch and increasing die size. Additionally, heterogeneous integration of different technologies in one single chip is becoming increasingly desirable, for which planar (two-dimensional) ICs may not be suitable. This paper analyzes the limitations of the existing interconnect technologies and design methodologies and presents a novel three-dimensional (3-D) chip design strategy that exploits the vertical dimension to alleviate the interconnect related problems and to facilitate heterogeneous integration of technologies to realize a system-on-a-chip (SoC) design. A comprehensive analytical treatment of these 3-D ICs has been presented and it has been shown that by simply dividing a planar chip into separate blocks, each occupying a separate physical level interconnected by short and vertical interlayer interconnects (VILICs), significant improvement in performance and reduction in wire-limited chip area can be achieved, without the aid of any other circuit or design innovations. A scheme to optimize the interconnect distribution among different interconnect tiers is presented and the effect of transferring the repeaters to upper Si layers has been quantified in this analysis for a two-layer 3-D chip. Furthermore, one of the major concerns in 3-D ICs arising due to power dissipation problems has been analyzed and an analytical model has been presented to estimate the temperatures of the different active layers. It is demonstrated that advancement in heat sinking technology will be necessary in order to extract maximum performance from these chips. Implications of 3-D device architecture on several design issues have also been discussed with special attention to SoC design strategies. Finally some of the promising technologies for manufacturing 3-D ICs have been outlined},
   Keywords = {VLSI, integrated circuit design, integrated circuit interconnections, integrated circuit modelling, VLSI chip design, analytical model, deep-submicron interconnect, heterogeneous integration, system-on-a-chip, three-dimensional integrated circuit, vertical interlayer interconnect},
   Year     = {2001},
}



@inproceedings{3D:BER06,
   Author    = {Bernstein, K.},
   Title     = {Introduction to {3D} integration},
   BookTitle = {International Solid State Circuits Conference Tutorial},
   Year      = {2006},
}



@article{3D:kerry-EDA06,
   Author  = {Bernstein, Kerry},
   Title   = {New Dimension in Performance},
   Journal = {EDA Forum},
   Volume  = {3},
   Number  = {2},
   Year    = {2006},
}



@inproceedings{3D:BNW+04,
   Author    = {Black, B. and Nelson, D. W. and Webb, C. and Samra, N.},
   Title     = {{3D} processing technology and its impact on {iA32} microprocessors},
   BookTitle = {International Conference on Computer Design ({ICCD})},
   Pages     = {316--318},
   Abstract  = {This short paper explores an implementation of a new technology called 3D die stacking and describes research activity at Intel. 3D die stacking is the bonding of two die either face-to-face or face-to-back in order to construct the 3D structure. In this work, a face-to-face bonding is utilized because it yields a higher density die-to-die inter-connect than is possible with face-to-back. With sufficiently dense die-to-die interconnect devices as complex as an iA32 microprocessor can be repartitioned or split between two die in order to simultaneously improve performance and power. The 3D structure of this emerging technology is examined and applied in this paper to a real x86 deeply pipelined high performance microprocessor. In this initial study, it is shown that a 3D implementation can potentially improve the performance by 15\% while improving power by 15\%.},
   Keywords  = {bonding processes, integrated circuit interconnections, integrated circuit layout, microprocessor chips, pipeline processing, 3D die stacking, 3D processing technology, 3D structure, die-to-die interconnect devices, face-to-back bonding, face-to-face bonding, iA32 microprocessors, real x86 deeply pipelined high performance microprocessor},
   Year      = {2004},
}



@inproceedings{WATTCH,
   Author    = {Brooks, D. and Tiwari, V. and Martonosi, M.},
   Title     = {{Wattch}: a framework for architectural-level power analysis and optimizations},
   BookTitle = {International Symposium on Computer Architecture},
   Pages     = {83--94},
   Abstract  = {Power dissipation and thermal issues are increasingly significant in modern processors. As a result, it is crucial that power/performance tradeoffs be made more visible to chip architects and even compiler writers, in addition to circuit designers. Most existing power analysis tools achieve high accuracy by calculating power estimates for designs only after layout or floorplanning are complete. In addition to being available only late in the design process, such tools are often quite slow, which compounds the difficulty of running them for a large space of design possibilities. This paper presents Wattch, a framework for analyzing and optimizing microprocessor power dissipation at the architecture-level. Wattch is 1000X or more faster than existing layout-level power tools, and yet maintains accuracy within 10\% of their estimates as verified using industry tools on leading-edge designs. This paper presents several validations of Wattch's accuracy. In addition, we present three examples that demonstrate how architects or compiler writers might use Wattch to evaluate power consumption in their design process. We see Wattch as a complement to existing lower-level tools; it allows architects to explore and cull the design space early on, using faster, higher-level tools. It also opens up the field of power-efficient computing to a wider range of researchers by providing a power evaluation methodology within the portable and familiar SimpleScalar framework.},
   Keywords  = {circuit CAD, computer architecture, microprocessor chips, power consumption, Wattch, architecture-level, microprocessor power dissipation, power analysis, power analysis tools, power dissipation, power/performance tradeoffs},
   Year      = {2000},
}



@article{3D:BGK97,
   Author   = {Burger, D. and Goodman, J. R. and K{\"a}gi, A.},
   Title    = {Limited bandwidth to affect processor design},
   Journal  = {IEEE Micro},
   Volume   = {17},
   Number   = {6},
   Pages    = {55--62},
   ISSN     = {0272-1732},
   Abstract = {This paper quantifies and compares the performance impacts of memory latencies and finite bandwidth. We show that the implementation of aggressive latency tolerance techniques aggravates stalls due to finite memory bandwidth, which actually become more significant than stalls resulting from uncongested memory latency alone. We expect that memory bandwidth limitations across the processor pins will drive significant architectural change. An execution-driven simulation measures the time that several SPEC95 benchmarks spend stalled for memory latency, limited-memory bandwidth and computing},
   Keywords = {discrete event simulation, microprocessor chips, SPEC95 benchmarks, execution-driven simulation, finite bandwidth, limited bandwidth, memory bandwidth limitations, memory latencies, memory latency, performance impacts, processor design},
   Year     = {1997},
}



@inproceedings{3D:CAR96,
   Author    = {Carson, J.},
   Title     = {The emergence of stacked {3D} silicon and its impact on microelectronics systems integration},
   BookTitle = {Proceedings of the Eighth Annual IEEE International Conference on Innovative Systems in Silicon},
   Pages     = {1--8},
   Abstract  = {Stacked 3D silicon has been under development for a number of years at a substantial level of investment on the part of Government as well as public and private investors. Volume manufacturing of this technology is now in place and foundry services are provided to designers of Stacked 3D silicon components and products. Stacked 3D silicon has already had a major impact on microelectronics systems and products into which it has been integrated. Examples given include solid state data recorders, digital signal processors, massively parallel processors, artificial neural networks, imaging processing, and imaging sensors. Manufacturing and cost issues are identified and discussed along with present status and projections showing that, as volumes rise, no significant premium will be required to incorporate Stacked 3D Silicon into standard products. The performance advantages of Stacked 3D silicon are very large: the ultra-high scale density results in factors of hundreds to thousands in both speed and power when ICs are designed for 3D. The paper concludes with a picture of the coming next generation 3D stacked silicon: 10-1000 layers of ultra-thin, low power circuits with 1000s of inter-layer interconnect comprising entire systems in a single cube},
   Keywords  = {ULSI, digital integrated circuits, economics, elemental semiconductors, integrated circuit interconnections, integrated circuit manufacture, integrated circuit technology, silicon, Si, data recorders, digital signal processors, foundry services, imaging processing, imaging sensors, inter-layer interconnect, microelectronics systems integration, neural networks, next generation devices, parallel processors, stacked 3D structures, ultra-high scale density, volume manufacturing},
   Year      = {1996},
}



@article{3D:CFT+04,
   Author   = {Chen, K. N. and Fan, A. and Tan, C. S. and Reif, R.},
   Title    = {Contact resistance measurement of bonded copper interconnects for three-dimensional integration technology},
   Journal  = {IEEE Electron Device Letters},
   Volume   = {25},
   Number   = {1},
   Pages    = {10--12},
   ISSN     = {0741-3106},
   Abstract = {A novel test structure for contact resistance measurement of bonded copper interconnects in three-dimensional integration technology is proposed and fabricated. This test structure requires a simple fabrication process and eliminates the possibility of measurement errors due to misalignment during bonding. Specific contact resistances of bonding interfaces with different interconnect sizes of approximately $10^{-8}$ $\Omega$-cm$^{2}$ are measured. A reduction in specific contact resistance is obtained by longer anneal time. The specific contact resistance of bonded interconnects with longer anneal time does not change with interconnect sizes.},
   Keywords = {contact resistance, copper, integrated circuit interconnections, integrated circuit modelling, 3-dimensional integration technology, anneal time, bonded copper interconnects, bonding interfaces, bonding-based misalignment, contact resistance measurement, contact resistances, measurement errors, test structure},
   Year     = {2004},
}



@article{3D:CS00,
   Author   = {Tsai, Ching-Han and Kang, Sung-Mo},
   Title    = {Cell-level placement for improving substrate thermal distribution},
   Journal  = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
   Volume   = {19},
   Number   = {2},
   Pages    = {253--266},
   ISSN     = {0278-0070},
   Abstract = {The dramatic increase of power consumption in very large scale integration circuits has led to high operating temperature and large thermal gradient, thereby resulting in serious timing and reliability concerns. Temperature-tracking is thus becoming of paramount importance in modern electronic design automation (EDA) tools. In this paper we present two thermal placement tools for standard cell and macro cell design styles respectively. They are aimed at reducing hot spots in a design without compromising traditional design metrics such as area and wire length. We developed a compact substrate thermal model that can be used by the placer to calculate the temperature profile of a placement efficiently, or to convert the user-specified temperature constraint into the corresponding power distribution constraint as an alternative placement objective. As a result, our method is much more efficient than directly employing temperature profile simulation during the placement process. The simulation results show noticeable improvement of thermal distribution over the traditional placement algorithm, with little impact on area and wire length of the final layout},
   Keywords = {VLSI, circuit layout CAD, integrated circuit layout, integrated circuit modelling, integrated circuit reliability, matrix algebra, substrates, temperature distribution, thermal analysis, thermal resistance, timing, EDA tools, VLSI circuits, cell-level placement, compact substrate thermal model, electronic design automation tools, hot spots reduction, macro cell design style, power consumption, power distribution constraint, reliability, standard cell design style, substrate thermal distribution, temperature profile, temperature-tracking, thermal gradient, thermal placement tools, user-specified temperature constraint, very large scale integration},
   Year     = {2000},
}



@inproceedings{3D:CV98,
   Author    = {Chiricescu, S. M. S. A. and Vai, M. M.},
   Title     = {A three-dimensional {FPGA} with an integrated memory for in-application reconfiguration data},
   BookTitle = {Proceedings of the 1998 IEEE International Symposium on Circuits and Systems (ISCAS '98)},
   Volume    = {2},
   Pages     = {232--235},
   Abstract  = {The architecture of a 3-dimensional FPGA for reconfigurable computing machines is described. This FPGA architecture is based on a novel 3-D VLSI circuit technology developed at Northeastern University. A new interconnection scheme as well as a new reconfiguration mechanism are features of the architecture},
   Keywords  = {CMOS logic circuits, VLSI, field programmable gate arrays, integrated circuit interconnections, integrated circuit layout, logic design, network routing, reconfigurable architectures, 3D FPGA architecture, 3D VLSI circuit technology, in-application reconfiguration data, integrated memory, interconnection scheme, reconfigurable computing machines, three-dimensional FPGA},
   Year      = {1998},
}



@article{3D:CW98,
   Author   = {Chu, C. C. N. and Wong, D. F.},
   Title    = {A matrix synthesis approach to thermal placement},
   Journal  = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
   Volume   = {17},
   Number   = {11},
   Pages    = {1166--1174},
   ISSN     = {0278-0070},
   Abstract = {In this paper, we consider the thermal placement problem for gate arrays. We introduce a new combinatorial optimization problem, matrix synthesis problem (MSP), to model the thermal placement problem. Given a list of $mn$ nonnegative real numbers and an integer $t$, MSP constructs a $m \times n$ matrix out of the given numbers such that the maximum sum among all $t \times t$ submatrices is minimized. We show that MSP is NP-complete and present several provably good approximation algorithms for the problem.},
   Keywords = {VLSI, circuit optimisation, integrated circuit layout, integrated circuit packaging, integrated circuit reliability, logic CAD, logic arrays, wiring, NP-complete, combinatorial optimization problem, matrix synthesis approach, matrix synthesis problem, nonnegative real numbers, provably good approximation algorithms, thermal placement},
   Year     = {1998},
}



% Cong, Wei and Zhang, thermal-driven 3D floorplanning (ICCAD 2004).
@inproceedings{3D:CONG04,
  author    = {Cong, J. and Wei, J. and Zhang, Y.},
  title     = {A Thermal-Driven Floorplanning Algorithm for {3D} {ICs}},
  booktitle = {International Conference on Computer Aided Design},
  pages     = {306--313},
  abstract  = {As the technology progresses, interconnect delays have become bottlenecks of chip performance. 3D integrated circuits are proposed as one way to address this problem. However, thermal problem is a critical challenge for 3D IC circuit design. We propose a thermal-driven 3D floorplanning algorithm. Our contributions include: (1) a new 3D floorplan representation, CBA and new interlayer local operations to more efficiently exploit the solution space; (2) an efficient thermal-driven 3D floorplanning algorithm with an integrated compact resistive network thermal model (CBA-T); (3) two fast thermal-driven 3D floorplanning algorithms using two different thermal models with different runtime and quality (CBA-T-Fast and CBA-T-Hybrid). Our experiments show that the proposed 3D floorplan algorithm with CBA representation can reduce the wirelength by 29\% compared with a recent published result from (Hsiu et al., 2004). In addition, compared to a nonthermal-driven 3D floorplanning algorithm, the thermal-driven 3D floorplanning algorithm can reduce the maximum on-chip temperature by 56\%.},
  keywords  = {integrated circuit interconnections, integrated circuit layout, thermal management (packaging), 3D IC circuit design, 3D floorplan representation, 3D integrated circuits, CBA, chip performance, integrated compact resistive network thermal model, interconnect delays, interlayer local operations, maximum on-chip temperature reduction, thermal-driven floorplanning algorithm, wirelength reduction},
  year      = {2004},
}



% Flotherm thermal modeling software (corporate author, so the name is double-braced
% to keep it a single indivisible name). The empty year field was dropped.
% NOTE(review): no year or URL is recorded for this tool; add howpublished/url if known.
@misc{3D:FEM,
  author = {{Flomerics Corp}},
  title  = {{Flotherm} Modeling Software},
}



% Crisp, Direct RAMbus overview (IEEE Micro 1997).
@article{3D:CRI97,
  author   = {Crisp, R.},
  title    = {Direct {RAMbus} technology: the new main memory standard},
  journal  = {IEEE Micro},
  volume   = {17},
  number   = {6},
  pages    = {18--28},
  issn     = {0272-1732},
  abstract = {Providing three times the memory bandwidth of the 66-MHz SDRAM subsystem, Direct RDRAM modules fit seamlessly into the existing mechanical space and airflow environment of the industry-standard PC chassis},
  keywords = {DRAM chips, modules, Direct RDRAM, Direct RDRAM modules, main memory standard},
  year     = {1997},
}



% Das, Chandrakasan and Reif, 3-D design tools (ASP-DAC 2003).
@inproceedings{3D:DCR03_1,
  author    = {Das, S. and Chandrakasan, A. and Reif, R.},
  title     = {Design Tools for {3-D} Integrated Circuits},
  booktitle = {Asia and South Pacific Design Automation Conference ({ASP-DAC})},
  pages     = {53--56},
  abstract  = {We present a set of design tools for 3-D integration. Using these tools - a 3-D standard-cell placement tool, global routing tool, and layout editor - we have targeted existing standard-cell circuit netlists for fabrication using wafer bonding. We have analyzed the performance of several circuits using these tools and find that 3-D integration provides significant benefits. For example, relative to single-die placement, we observe on average 28\% to 51\% reduction in total wire length.},
  keywords  = {VLSI, application specific integrated circuits, cellular arrays, circuit layout CAD, integrated circuit layout, network routing, 3D IC design tools, 3D integrated circuits, 3D integration, 3D standard-cell placement tool, global routing tool, layout editor, standard-cell circuit netlists, wafer bonding},
  year      = {2003},
}



% Das, Chandrakasan and Reif, 3-D IC performance and CAD tools (ISVLSI 2003).
@inproceedings{3D:DCR03,
  author    = {Das, S. and Chandrakasan, A. and Reif, R.},
  title     = {Three-dimensional integrated circuits: performance, design methodology, and {CAD} tools},
  booktitle = {Proceedings of the {IEEE} Computer Society Annual Symposium on {VLSI}},
  pages     = {13--18},
  abstract  = {Three-dimensional integration technologies have been proposed in order to mitigate design challenges posed by deep-submicron interconnect. By providing multiple layers of active devices together with high-density local interconnects between these layers, 3-D technologies give digital-circuit designers greater freedom in meeting power and delay budgets that are increasingly interconnect-dominated. In this paper, we quantify the benefits 3-D integration can provide, using specific circuit benchmarks. We perform this analysis using a suite of circuit design tools we have developed for 3-D integration. We observe that on average, 28\% to 51\% reduction in total wire length is possible over two to five wafers respectively; similarly, 31\% to 56\% reduction in the length of the longest wire is achievable. We also characterize the impact of technology parameters on these reductions.},
  keywords  = {circuit CAD, digital integrated circuits, integrated circuit design, CAD tool, active device, deep-submicron interconnect, design methodology, digital circuit, three-dimensional integrated circuit, wire length},
  year      = {2003},
}



% Das, Chandrakasan and Reif, Rent's rule calibration for 3-D ICs (IEEE TVLSI 2004).
% The journal field name was misspelled as "Journa", so the required journal was missing.
@article{3D:DCR04,
  author   = {Das, S. and Chandrakasan, A. P. and Reif, R.},
  title    = {Calibration of {Rent's} rule models for three-dimensional integrated circuits},
  journal  = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
  volume   = {12},
  number   = {4},
  pages    = {359--366},
  issn     = {1063-8210},
  abstract = {In this paper, we determine the accuracy of Rahman's interconnect prediction model for three-dimensional (3-D) integrated circuits. Utilizing this model, we calculate the wiring requirement for a set of benchmark standard-cell circuits. We then obtain placed and routed wirelength figures for these circuits using 3-D standard-cell placement and global-routing tools we have developed. We find that the Rahman model predicts wirelengths accurately (to within 20\% of placement and of routing, on average), and suggest some areas for minor improvement to the model.},
  keywords = {VLSI, calibration, circuit layout CAD, integrated circuit interconnections, integrated circuit layout, integrated circuit modelling, logic partitioning, network routing, Rahman model, Rent's rule models, benchmark standard-cell circuits, global-routing tools, model calibration, placed wirelength figures, routed wirelength figures, system-level interconnect prediction, three-dimensional integrated circuits, wafer-bonded structure, wiring requirement},
  year     = {2004},
}



% Davis et al., survey of 3D IC technology trade-offs (IEEE Design and Test 2005).
@article{3D:DWM+05,
  author   = {Davis, W. R. and Wilson, J. and Mick, S. and Xu, J. and Hua, H. and Mineo, C. and Sule, A. M. and Steer, M. and Franzon, P. D.},
  title    = {Demystifying {3D} {ICs}: the Pros and Cons of Going Vertical},
  journal  = {IEEE Design and Test of Computers},
  volume   = {22},
  number   = {6},
  pages    = {498--510},
  abstract = {This article provides a practical introduction to the design trade-offs of the currently available 3D IC technology options. It begins with an overview of techniques, such as wire bonding, microbumps, through vias, and contactless interconnection, comparing them in terms of vertical density and practical limits to their use. We then present a high-level discussion of the pros and cons of 3D technologies, with an analysis relating the number of transistors on a chip to the vertical interconnect density using estimates based on Rent's rule. Next, we provide a more detailed design example of inductively coupled interconnects, with measured results of a system fabricated in a 0.35-$\mu$m technology and an analysis of misalignment and crosstalk tolerances. Lastly, we present a case study of a fast Fourier transform (FFT) placed and routed in a 0.18-$\mu$m through-via silicon-on-insulator (SOI) technology, comparing the 3D design to a traditional 2D approach in terms of wire length and critical-path delay.},
  year     = {2005},
}



% Deng and Maly, feasibility of 2.5D system integration (CICC 2003).
@inproceedings{3D:DM03,
  author    = {Deng, Y. and Maly, W.},
  title     = {A Feasibility Study of {2.5D} System Integration},
  booktitle = {Proceedings of the {IEEE} 2003 Custom Integrated Circuits Conference},
  pages     = {667--670},
  abstract  = {Excessive on-chip wire length and fast increasing fabrication cost have been the main factors impairing the effectiveness of monolithic integration of VLSI systems. To address these problems, this paper investigates a die stacking based system integration scheme (2.5D system integration). We performed a series of design case studies and developed layout design tools for this new scheme. Our results show that this new scheme has a potential to outperform its monolithic equivalent.},
  keywords  = {VLSI, integrated circuit design, integrated circuit interconnections, integrated circuit packaging, multichip modules, 2.5D system integration, VLSI monolithic integration, die stacking, layout design tools, on-chip wire length},
  year      = {2003},
}



% Deng and Maly, physical design flow for 2.5D stacked systems (ICCD 2003).
@inproceedings{3D:DM03_1,
  author    = {Deng, Y. S. and Maly, W.},
  title     = {Physical design of the {``2.5D''} stacked system},
  booktitle = {International Conference on Computer Design},
  pages     = {211--217},
  abstract  = {Excessive on-chip wire length and fast increasing fabrication cost have been the main factors impairing the effectiveness of monolithic system-on-chip. We investigate a die stacking based system integration strategy (2.5D system integration) to address these problems. The new scheme is design-tools-enabled rather than technology-driven. We developed a layout design framework, which is able to floorplan, place and route a VLSI design into stacked chips. Our results show that this new scheme has a potential to outperform its monolithic equivalent.},
  keywords  = {VLSI, circuit layout CAD, integrated circuit interconnections, integrated circuit layout, system-on-chip, 2.5D stacked system design, VLSI design, circuit layout design, die stacking, floorplanning, monolithic system-on-chip, system integration strategy},
  year      = {2003},
}



% Kim, processor-memory bandwidth analysis (Asilomar 1988).
% NOTE(review): the export listed the author as "Dongseung, Kim"; family name is presumably Kim - confirm.
% The placeholder abstract "n/a" was dropped.
@inproceedings{3D:DON88,
  author    = {Kim, Dongseung},
  title     = {Generalized Processor-Memory Bandwidth Analysis For Fully Shared-Memory Multiprocessors},
  booktitle = {Twenty-Second Asilomar Conference on Signals, Systems and Computers},
  volume    = {2},
  pages     = {617--621},
  year      = {1988},
}



% Garcia et al., hybrid SRAM/DRAM packet buffer design (MICRO-36, 2003).
@inproceedings{3D:GCC+03,
  author    = {Garcia, J. and Corbal, J. and Cerda, L. and Valero, M.},
  title     = {Design and implementation of high-performance memory systems for future packet buffers},
  booktitle = {Proceedings of the 36th Annual {IEEE/ACM} International Symposium on Microarchitecture ({MICRO-36})},
  pages     = {372--384},
  abstract  = {In this paper, we address the design of a future high-speed router that supports line rates as high as OC-3072 (160 Gb/s), around one hundred ports and several service classes. Building such a high-speed router would raise many technological problems, one of them being the packet buffer design, mainly because in router design it is important to provide worst-case bandwidth guarantees and not just average-case optimizations. A previous packet buffer design provides worst-case bandwidth guarantees by using a hybrid SRAM/DRAM approach. Next-generation routers need to support hundreds of interfaces (i.e., ports and service classes). Unfortunately, high bandwidth for hundreds of interfaces requires the previous design to use large SRAMs which become a bandwidth bottleneck. The key observation we make is that the SRAM size is proportional to the DRAM access time but we can reduce the effective DRAM access time by overlapping multiple accesses to different banks, allowing us to reduce the SRAM size. The key challenge is that to keep the worst-case bandwidth guarantees, we need to guarantee that there are no bank conflicts while the accesses are in flight. We guarantee bank conflicts by reordering the DRAM requests using a modern issue-queue-like mechanism. Because our design may lead to fragmentation of memory across packet buffer queues, we propose to share the DRAM space among multiple queues by renaming the queue slots. To the best of our knowledge, the design proposed in this paper is the fastest buffer design using commodity DRAM to be published to date.},
  keywords  = {DRAM chips, SRAM chips, buffer storage, memory architecture, packet switching, routing protocols, DRAM, SRAM, access time, issue-queue-like mechanism, memory systems, multiple queues, optimizations, packet buffer design, packet buffers, queue slots, router design, worst-case bandwidth},
  year      = {2003},
}



% Gebotys, multiword memory access allocation for DSP processors (DAC 2001).
% This key was defined twice with identical data (BibTeX reports a repeated entry);
% the two copies are merged here, keeping the abstract without the "compard" typo.
@inproceedings{3D:GEB01,
  author    = {Gebotys, C. H.},
  title     = {Utilizing memory bandwidth in {DSP} embedded processors},
  booktitle = {Proceedings of the Design Automation Conference},
  pages     = {347--352},
  abstract  = {This paper presents a network flow approach to solving the register binding and allocation problem for multi word memory access DSP processors. In recently announced DSP processors, such as Star*core, sixteen bit instructions which simultaneously access four words from memory are supported. A polynomial-time network flow methodology is used to allocate multiword accesses while minimizing code size. Results show that improvements of up to 87\% in terms of memory bandwidth (and up to 30\% reduction in energy dissipation) are obtained compared to compiler-generated DSP code. This research is important for industry since this value-added technique can increase memory bandwidths and minimize code size without increasing cost.},
  keywords  = {digital signal processing chips, embedded systems, optimising compilers, processor scheduling, storage allocation, DSP embedded processors, Star*core, allocation problem, code size, memory bandwidth, memory bandwidths, multi word memory access, network flow approach, polynomial-time network flow methodology, register binding, value-added technique},
  year      = {2001},
}



% Gillingham and Vogley, SLDRAM interface overview (IEEE Micro 1997).
@article{3D:GV97,
  author   = {Gillingham, P. and Vogley, B.},
  title    = {{SLDRAM}: high-performance, open-standard memory},
  journal  = {IEEE Micro},
  volume   = {17},
  number   = {6},
  pages    = {29--39},
  issn     = {0272-1732},
  abstract = {The primary objective of DRAM---dynamic random access memory---is to offer the largest memory capacity at the lowest possible cost. Designers achieve this by two means. First, they optimize the process and the design to minimize die area. Second, they ensure that the device serves high-volume markets and can be mass-produced to achieve the greatest economies of scale. SLDRAM---synchronous-link DRAM---is a new memory interface specification developed through the cooperative efforts of leading semiconductor memory manufacturers and high-end computer architects and system designers. SLDRAM meets the high data bandwidth requirements of emerging processor architectures and retains the low cost of earlier DRAM interface standards. These and other benefits suggest that SLDRAM will become the mainstream commodity memory of the early 21st century},
  keywords = {DRAM chips, technological forecasting, SLDRAM, dynamic random access memory, mainstream commodity memory, memory interface, open-standard memory},
  year     = {1997},
}



% Goetz, SoC design methodology applied to system-in-package (ECTC 2002).
@inproceedings{3D:GOE02,
  author    = {Goetz, M.},
  title     = {System on chip design methodology applied to system in package architecture},
  booktitle = {Proceedings of the 52nd Electronic Components and Technology Conference},
  pages     = {254--258},
  abstract  = {There are two competing technologies pursuing the 'holy grail' of complete system integration. Today, the most common method used to create the 'system' is to mount separately packaged ICs on a next-level substrate. Even with a low pin count, a package is typically several times larger than the IC, to accommodate the low wiring density on the PCB. High-performance systems, such as network processor systems, require high data bandwidth between key components and thus need an increased number of signal I/Os. Wide I/O busses switching at high speeds consequently require a larger number of power and ground pins to reduce switching noise. As a result, system performance is limited by increasing package size and the associated parasitic inductance and capacitance of the package and its connection on the PCB. System on chip (SoC) architecture attempts to integrate many functions, both analog and digital into a monolithic device. The successes are many, but so are the challenges. Many functions cannot be optimized due to the limitation of the semiconductor substrate used. Also, as defect density scales with area, the notion of integrating large scale functions (memory, switch fabrics) with small scale functions (rf devices) results in compounded yield impacts. System in Package (SiP) technology allows heterogeneous devices to be integrated into a small form factor. The integration technology includes embedded devices in the substrate and 3 dimensional chip-stacking approaches. By using a silicon based SiP, a copper/low K interconnect defined by lithographic processes on silicon offers very dense routing with high speed, low noise signal paths. The ICs used in the SiP can be designed to leverage the high density interconnect by optimizing both the core and the I/O of each device. Additionally, specialized devices can be designed specifically for the SiP architecture to take advantage of the high bandwidth and low latency features. Reducing chip-to-chip bus capacitance can dramatically decrease system power requirements and thermal dissipation. The lower bus power can be traded against higher bus frequency to improve performance at a fixed power level.},
  keywords  = {integrated circuit design, integrated circuit packaging, IC packaging, bus standard, core-based interface, embedded device, form factor, high density interconnect, system-in-package architecture, system-on-chip design methodology, three-dimensional chip stacking},
  year      = {2002},
}



% Gomez et al., loop morphing for memory bandwidth (ASAP 2004).
% NOTE(review): author spellings changed from "Verdoorlaege, S." and "Catthoor, L." to
% "Verdoolaege, S." and "Catthoor, F." - confirm against the proceedings.
@inproceedings{3D:GMV+04,
  author    = {Gomez, J. I. and Marchal, P. and Verdoolaege, S. and Pinuel, L. and Catthoor, F.},
  title     = {Optimizing the memory bandwidth with loop morphing},
  booktitle = {Proceedings of the 15th {IEEE} International Conference on Application-Specific Systems, Architectures and Processors},
  pages     = {213--223},
  abstract  = {The memory bandwidth largely determines the performance of embedded systems. However, very often compilers ignore the actual behavior of the memory architecture, causing large performance loss. To better utilize the memory bandwidth, several researchers have introduced instruction scheduling/data assignment techniques. Because they only optimize the bandwidth inside each basic block, they often fail to use all available bandwidth. Loop fusion is an interesting alternative to more globally optimize the memory access schedule. By fusing loops we increase the number of independent memory operations inside each basic block. The compiler can then better exploit the available bandwidth and increase the system's performance. However, existing fusion techniques can only combine loops with a conformable header. To overcome this limitation we present loop morphing; we combine fusion with strip mining and loop splitting. We also introduce a technique to steer loop morphing such that we find a compact memory access schedule. Experimental results show that with our approach we can decrease the execution time up to 88\%.},
  keywords  = {bandwidth allocation, embedded systems, memory architecture, optimisation, program compilers, program control structures, scheduling, data assignment, instruction scheduling, loop fusion, loop morphing, loop splitting, memory access scheduling, memory bandwidth optimization, strip mining},
  year      = {2004},
}



% Goplen and Sapatnekar, force-directed thermal placement for 3D ICs (ICCAD 2003).
@inproceedings{3D:GS03,
  author    = {Goplen, B. and Sapatnekar, S.},
  title     = {Efficient thermal placement of standard cells in {3D} {ICs} using a force directed approach},
  booktitle = {International Conference on Computer Aided Design},
  pages     = {86--89},
  abstract  = {As the technology node progresses, thermal problems are becoming more prominent especially in the developing technology of three-dimensional (3D) integrated circuits. The thermal placement method presented in this paper uses an iterative force-directed approach in which thermal forces direct cells away from areas of high temperature. Finite element analysis (FEA) is used to calculate temperatures efficiently during each iteration. Benchmark circuits produce thermal placements with both lower temperatures and thermal gradients while wirelength is minimally affected.},
  keywords  = {cellular arrays, finite element analysis, integrated circuit design, temperature distribution, 3D IC, FEA, benchmark circuits, iterative force directed approach, standard cells, thermal gradients, thermal placement, wirelength},
  year      = {2003},
}



% Goplen and Sapatnekar, thermal via placement (ISPD).
% NOTE(review): year changed from 2004 to 2005 to agree with the citation key and the
% ISPD 2005 proceedings - confirm. A stray trailing comma was removed from the title.
@inproceedings{3d-GS05,
  author    = {Goplen, B. and Sapatnekar, S. S.},
  title     = {Thermal Via Placement in {3D} {ICs}},
  booktitle = {International Symposium on Physical Design},
  pages     = {167--174},
  year      = {2005},
}



% Goplen and Sapatnekar, thermal via placement with various objectives (IEEE TCAD).
% The entry had no citation key and therefore failed to parse; a key was added following
% the file's "_1" suffix convention for a second work by the same authors.
% NOTE(review): volume, number and pages are not recorded, and the year may reflect a
% pre-publication citation - complete from the publisher record.
@article{3d-GS05_1,
  author  = {Goplen, B. and Sapatnekar, S. S.},
  title   = {Placement of Thermal Vias in {3D} {ICs} using Various Thermal Objectives},
  journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  year    = {2005},
}



% Hua et al., timing/power/temperature trade-offs in 3D ICs (DAC 2006).
% The first author was exported as "Hao, Hua"; the family name is Hua (listed as
% "Hua, H." among the authors of 3D:DWM+05 in this file).
@inproceedings{3D:HMS+06,
  author    = {Hua, Hao and Mineo, C. and Schoenfliess, K. and Sule, A. and Melamed, S. and Jenkal, R. and Davis, W. R.},
  title     = {Exploring compromises among timing, power and temperature in three-dimensional integrated circuits},
  booktitle = {Design Automation Conference ({DAC})},
  pages     = {997--1002},
  abstract  = {Three-dimensional integrated circuits (3DICs) have the potential to reduce interconnect lengths and improve digital system performance. However, heat removal is more difficult in 3DICs, and the higher temperatures increase delay and leakage power, potentially negating the performance improvement. Thermal vias can help to remove heat, but they create routing congestion, which also leads to longer interconnects. It is therefore very difficult to tell whether or not a particular system may benefit from 3D integration. In order to help understand this trade-off, physical design experiments were performed on a low-power and a high-performance design in an existing 3DIC technology. Each design was partitioned and routed with varying numbers of tiers and thermal-via densities. A thermal-analysis methodology is developed to predict the final performance. Results show that the lowest energy per operation and delay are achieved with 4 or 5 tiers. These results show a reduction in energy and delay of up to 27\% and 20\% compared to a traditional 2DIC approach. In addition, it is shown that thermal-vias offer no performance benefit for the low-power system and only marginal benefit for the high-performance system.},
  keywords  = {integrated circuit design, integrated circuit interconnections, low-power electronics, network routing, 3D integrated circuits, digital system performance improvement, heat removal, interconnect length reduction, leakage power, routing congestion, temperature dependency, thermal analysis, thermal vias, 3DIC, Design, Experimentation, Performance, design flow, trade off},
  year      = {2006},
}



% Hoefflinger, Liu and Vajdic, 3-D CMOS design methodology (IEEE TED 1984).
@article{3D:HLV84,
  author   = {Hoefflinger, B. and Liu, S. T. and Vajdic, B.},
  title    = {A three-dimensional {CMOS} design methodology},
  journal  = {IEEE Transactions on Electron Devices},
  volume   = {31},
  number   = {2},
  pages    = {171--173},
  issn     = {0018-9383},
  abstract = {A technology-updatable design methodology for three-dimensional (3-D) CMOS circuits has been developed. Four levels of abstraction have been implemented with topographical congruence: 1) technology level, 2) mask level, 3) transistor level, and 4) logic level. A novel transistor level symbolic representation is introduced which emphasizes the three-dimensional nature of the circuits. A number of design examples is presented.},
  year     = {1984},
}



% Hung et al., interconnect- and thermal-aware 3D floorplanning (ISQED 2006).
% NOTE(review): no page range is recorded for this paper.
@inproceedings{3D:HLY+06,
  author    = {Hung, W. L. and Link, G. M. and Xie, Y. and Narayanan, V. and Irwin, M. J.},
  title     = {Interconnect and thermal-aware floorplanning for {3D} microprocessors},
  booktitle = {International Symposium on Quality Electronic Design},
  abstract  = {Interconnects are becoming an increasing problem from both performance and power consumption perspective in future technology nodes. The introduction of 3D chip architectures, with their intrinsic capability of reducing wire length, is one of the promising solutions to mitigate the interconnect problem. While interconnect power consumption reduces due to the adoption of 3D designs, the stacking of multiple active layers leads to higher power densities. Thus, high peak temperatures are of major concern in 3D designs. Consequently, we present a thermal-aware floorplanner for 3D architectures. In contrast to most prior work, our floorplanner considers the interconnect power consumption in exploring a thermal-aware floorplan. Our results show that excluding interconnect power can result in peak temperatures being underestimated by as much as 15$^{\circ}$C in 90nm technology. Finally, we demonstrate that our floorplanner is effective in lowering peak temperatures using a microprocessor design and four MCNC designs as benchmarks.},
  keywords  = {integrated circuit interconnections, integrated circuit layout, logic design, microprocessor chips, 3D chip architectures, 3D microprocessors, 90 nm, interconnect power consumption, interconnect-aware floorplanning, thermal-aware floorplanning, wire length},
  year      = {2006},
}



% Rabaey, Chandrakasan and Nikolic, Digital Integrated Circuits textbook, second edition.
% Initials were glued to the surnames ("J.Rabaey"), which made each whole token a surname;
% the edition and location are moved out of the title and publisher fields.
@book{3D:JAB03,
  author    = {Rabaey, Jan and Chandrakasan, Anantha and Nikolic, Borivoje},
  title     = {Digital Integrated Circuits: A Design Perspective},
  edition   = {Second},
  publisher = {Prentice Hall},
  address   = {Upper Saddle River, NJ},
  year      = {2003},
}



% Jacob et al., performance prediction for a 3D processor-memory stack (IEEE Design and Test 2005).
@article{3d-JEZ05,
  author   = {Jacob, P. and Erdogan, O. and Zia, A. and Belemjian, P. M. and Kraft, R. P. and McDonald, J. F.},
  title    = {Predicting the Performance of a {3D} Processor-memory Chip Stack},
  journal  = {IEEE Design and Test of Computers},
  volume   = {22},
  number   = {6},
  pages    = {540--547},
  abstract = {We are exploring a 3D processor-memory stack for use with the message passing interface (MPI). The communication among processors in huge servers wastes several thousands of cycles. Most of these wasted cycles do not come from the communication link among the processors across the system, but rather in handling the message packets. A processor that could handle this message packing and communication at a much faster rate could significantly increase this task's efficiency and thus increase the utilization of such supercomputers, currently a very low 1\%. However, at such high clock rates, the memory wall would become a significant problem. Tackling this problem requires innovative technologies, such as 3D memories, which alleviate some problems with long on-chip interconnects. The importance of interconnection wires to circuit performance is on a chip. The need for shorter interconnection delays suggests shorter interconnection wires. Shorter interconnections are more likely in 3D architectures than in equivalent 2D systems. This article explores the advantages of 3D in a processor-memory stack system. We conducted simulations using simple tools like Dinero IV and the cache access and cycle time information (Cacti) to evaluate the performances of various memory architectures.},
  year     = {2005},
}



% Hasan, Chandra and Vijaykumar, DRAM row locality for network processors (ISCA 2003).
% NOTE(review): the export had given and family names swapped ("Jahangir, Hasan",
% "Satish, Chandra"); corrected to family-name-first - confirm against the proceedings.
@inproceedings{3D:JSV03,
  author    = {Hasan, Jahangir and Chandra, Satish and Vijaykumar, T. N.},
  title     = {Efficient use of memory bandwidth to improve network processor throughput},
  booktitle = {Proceedings of the 30th Annual International Symposium on Computer Architecture},
  pages     = {300--311},
  abstract  = {We consider the efficiency of packet buffers used in packet switches built using network processors (NPs). Packet buffers are typically implemented using DRAM, which provides plentiful buffering at a reasonable cost. The problem we address is that a typical NP workload may be unable to utilize the peak DRAM bandwidth. Since the bandwidth of the packet buffer is often the bottleneck in the performance of a shared-memory packet switch, inefficient use of available DRAM bandwidth further reduces the packet throughput. Specialized hardware-based schemes that alleviate the DRAM bandwidth problem in high-end routers may be less applicable to NP-based systems, in which cost is an important consideration. We propose cost-effective ways to enhance average-case DRAM bandwidth. In modern DRAMs, successive accesses falling within the same DRAM row are significantly faster than those falling across rows. If accesses to DRAM can be generated differently or reordered to take advantage of fast same-row accesses, peak DRAM bandwidth can be approached. The challenge is in exploiting this ``row locality'' despite the unpredictable nature of memory accesses in NPs. We propose a set of simple techniques to meet this challenge. These include locality-sensitive buffer allocation on packet input, reordering DRAM accesses to increase locality, and prefetching to reduce row miss penalty. We evaluate our techniques on cycle-accurate simulations of Intel's IXP 1200 network processor and find that they boost packet throughput on average by 42.7\%, utilizing nearly the peak DRAM bandwidth, for a set of common NP applications processing a real trace.},
  keywords  = {DRAM chips, bandwidth allocation, buffer storage, multi-threading, packet switching, shared memory systems, DRAM bandwidth, Intel IXP 1200 NP, NP applications, NP throughput, locality-sensitivity buffer allocation, memory access, network processor, packet buffers, row locality, shared memory packet switching},
  year      = {2003},
}



@inproceedings{3D:JM02,
   Author = {Joyner, J. W. and Meindl, J. D.},
   Title = {Opportunities for reduced power dissipation using three-dimensional integration},
   BookTitle = {Interconnect Technology Conference},
   Pages = {148--150},
   Year = {2002},
   Abstract = {The opportunities for reducing power dissipation using three-dimensional integration, particularly the power needed to switch the interconnects, are investigated. In a three-dimensional implementation, both the gate pitch and the total interconnect length in gate pitches can be reduced from the values required in a two-dimensional implementation. The simultaneous scaling of these two values leads to an overall reduction in the interconnect power by roughly a factor of the square root of the number of strata. For example, use of four strata leads to roughly a 50\% reduction in total interconnect power. The reduction in interconnect lengths leads to smaller interconnect capacitances, offering the opportunity to lower transistor power as well.},
   Keywords = {VLSI, capacitance, integrated circuit interconnections, integrated circuit layout, low-power electronics, 3D integration, gate pitch reduction, interconnect capacitances, interconnect design optimization, interconnect power reduction, interconnect switching, power dissipation reduction, simultaneous scaling, three-dimensional integration, total interconnect length reduction}
}



@article{3D:JVZ+01,
   Author = {Joyner, J. W. and Venkatesan, R. and Zarkesh-Ha, P. and Davis, J. A. and Meindl, J. D.},
   Title = {Impact of three-dimensional architectures on interconnects in gigascale integration},
   Journal = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
   Volume = {9},
   Number = {6},
   Pages = {922--928},
   Year = {2001},
   ISSN = {1063-8210},
   Abstract = {An interconnect distribution model for homogeneous, three-dimensional (3-D) architectures with variable separation of strata is presented. Three-dimensional architectures offer an opportunity to reduce the length of the longest interconnects. The separation of strata has little impact on the length of interconnects but a large impact on the number of interstratal interconnects. Using a multilevel interconnect methodology for an ITRS 2005 100 nm ASIC, a two-strata architecture offers a 3.9$\times$ increase in wire-limited clock frequency, an 84\% decrease in wire-limited area or a 25\% decrease in the number of metal levels required. In practice, however, such fabrication advances as improved alignment tolerances in wafer-bonding techniques are needed to gain key advantages stemming from 3-D architectures for homogeneous gigascale integrated circuits.},
   Keywords = {VLSI, application specific integrated circuits, integrated circuit interconnections, integrated circuit layout, logic partitioning, multivalued logic circuits, ASIC, gigascale integration, homogeneous architecture, homogeneous logic blocks, improved alignment tolerances, interconnect distribution model, multilevel interconnect methodology, number of metal levels, three-dimensional architecture, two-strata architecture, variable separation of strata, wafer-bonding techniques, wire-limited area, wire-limited clock frequency}
}



@article{3D:KAT97,
   Author = {Katayama, Y.},
   Title = {Trends in semiconductor memories},
   Journal = {IEEE Micro},
   Volume = {17},
   Number = {6},
   Pages = {10--17},
   Year = {1997},
   ISSN = {0272-1732},
   Abstract = {Despite their great market success, DRAMs have not kept pace with microprocessor improvements, so researchers are looking to advanced high-speed DRAM and merged DRAM/logic technologies to increase memory system performance.},
   Keywords = {DRAM chips, semiconductor storage, technological forecasting, DRAMs, high-speed DRAM, memory system performance, merged DRAM/logic, semiconductor memories}
}



@inproceedings{3D:KZR04,
   Author = {Khalid, A. U. and Zilic, Z. and Radecka, K.},
   Title = {{FPGA} emulation of quantum circuits},
   BookTitle = {Proceedings of the IEEE International Conference on Computer Design: VLSI in Computers and Processors (ICCD 2004)},
   Pages = {310--315},
   Year = {2004},
   Abstract = {Quantum computing offers immense speedup in performing tasks such as data encryption and searching. The quantum algorithms can be modeled using classical computing devices, however classical computer simulations cannot deal efficiently with the parallelism present in quantum algorithms. The quantum circuit model for quantum algorithms is sufficient to describe the known quantum algorithms. Using analogies between quantum and digital circuits, we design the emulator of quantum algorithms in FPGAs that allows efficient experimentation with new quantum algorithms. This paper concentrates on new techniques for modeling quantum circuits, including the entanglement and probabilistic computing realization, as well as the critical issues in the required precision of computing.},
   Keywords = {field programmable gate arrays, logic design, probability, quantum gates, FPGA emulator design, digital circuits, quantum algorithm, quantum circuit model, quantum computing}
}



@article{3D:KVK+05,
   Author = {Kim, S. and Vijaykrishnan, N. and Kandemir, M. and Irwin, M. J.},
   Title = {Exploiting temporal loads for low latency and high bandwidth memory},
   Journal = {IEE Proceedings - Computers and Digital Techniques},
   Volume = {152},
   Number = {4},
   Pages = {457--466},
   Year = {2005},
   ISSN = {1350-2387},
   Abstract = {Increasing clock frequencies and issue rates aggravates the memory latency problem, imposing higher memory bandwidth requirements. While caches can be multi-ported to provide high memory bandwidth, the increase in access latency with the increase in the number of ports limits their potential. The paper proposes a novel technique, called the 'temporal load cache architecture', to reduce load latencies and provide higher memory bandwidths. The key motivation for the technique is that temporal loads --- dynamic instances of a static load instruction that access the same address as that accessed by the last dynamic instance of the same static load --- constitute 48\% of all dynamic loads on average for the SPEC2000 benchmarks. When a load is predicted to be temporal, the data predicted to be accessed by it are read early in the pipeline from a small temporal load cache that stores the temporal data. The proposed temporal load cache architecture has two main advantages. First, since instructions dependent on a temporal load are provided with their data early in the pipeline, they can be issued as soon as they resolve their remaining data dependences and resource conflicts. Second, since a large percentage of loads can be filtered by the temporal load cache, the main data cache can service other (nontemporal) loads better, providing higher memory bandwidth. The experimental results show that the proposed temporal load cache architecture improves performance by 8.3\% on average for the SPEC2000 integer benchmarks.},
   Keywords = {cache storage, memory architecture, pipeline processing, SPEC2000 benchmarks, clock frequencies, high bandwidth memory, load latencies, low latency, main data cache, memory latency problem, small temporal load cache, static load instruction, temporal load cache architecture, temporal loads}
}



% No abstract is recorded for this paper; the former "N/A" placeholder was dropped.
@inproceedings{3D:KMW+03,
   Author = {Klumpp, A. and Merkel, R. and Wieland, R. and Ramm, P.},
   Title = {Chip-to-wafer stacking technology for {3D} system integration},
   BookTitle = {Proceedings of the 53rd Electronic Components and Technology Conference},
   Pages = {1080--1083},
   Year = {2003}
}



@inproceedings{3D:KK05,
   Author = {Koukis, E. and Koziris, N.},
   Title = {Memory bandwidth aware scheduling for {SMP} cluster nodes},
   BookTitle = {Proceedings of the 13th Euromicro Conference on Parallel, Distributed and Network-Based Processing (PDP 2005)},
   Pages = {187--196},
   Year = {2005},
   Abstract = {Clusters of SMPs are becoming increasingly common. However, the shared memory design of SMPs and the consequential contention between system processors for access to main memory can limit their efficiency significantly. Moreover, the continuous improvement of modern cluster interconnection technologies leads to the network bandwidth being a significant fraction of the total memory bandwidth of the machine, thus the NIC of an SMP cluster node can also become a major consumer of shared memory bus bandwidth. In this paper we first provide experimental evidence that contention on the shared memory bus can have major impact on the total execution time of processes even when no processor sharing is involved, then present the design and implementation of an informed scheduling algorithm for multiprogrammed workloads, which tries to carefully select processes to be coscheduled so that bus saturation is avoided. The input data needed by our scheduler are acquired dynamically, at run-time, using architecture-specific performance monitoring counters and a modified version of the NIC firmware, with no changes to existing application binaries. Experimental comparison between our scheduler and the standard Linux 2.6 O(1) scheduler shows average system throughput improvements in the range of 5--25\%.},
   Keywords = {bandwidth allocation, network interfaces, scheduling, shared memory systems, storage allocation, system buses, workstation clusters, Linux 2.6, NIC, NIC firmware, SMP cluster node interconnection technologies, architecture-specific performance monitoring counters, multiprogrammed workloads, scheduling algorithm, shared memory bus bandwidth}
}



@article{3D:KKB04,
   Author = {Chen, K. and Kobrinsky, M. J. and Barnett, B. C. and Reif, R.},
   Title = {Comparisons of conventional, {3-D}, optical, and {RF} interconnects for on-chip clock distribution},
   Journal = {IEEE Transactions on Electron Devices},
   Volume = {51},
   Number = {2},
   Pages = {233--239},
   Year = {2004},
   ISSN = {0018-9383},
   Abstract = {This paper analyzes the performance of different interconnect technologies for on-chip clock distribution, including conventional, three-dimensional, optical, and radio frequency interconnects. Skew, power, and area usage were estimated for each of these technologies based on the 2001 International Technology Roadmap for Semiconductors. Our results indicate that most of the skew and power are associated with local clock distribution. Consequently, since the alternative clock distribution approaches that have been proposed focus on global clock distribution, we have not found significant advantages over conventional clock distribution in terms of skew and power. Furthermore, it was found that low skews could be attained with conventional clock distribution schemes if the clock signals are not scaled down.},
   Keywords = {clocks, integrated circuit interconnections, integrated circuit modelling, optical interconnections, system-on-chip, timing circuits, 2001 International Technology Roadmap for Semiconductors, 3D interconnects, RF interconnects, area usage, conventional interconnects, global clock distribution, interconnect technologies, local clock distribution, on-chip clock distribution, optical interconnects, power usage, radio frequency interconnects, skew}
}



@article{3D:KOI95,
   Author = {Kumanoya, M. and Ogawa, T. and Inoue, K.},
   Title = {Advances in {DRAM} interfaces},
   Journal = {IEEE Micro},
   Volume = {15},
   Number = {6},
   Pages = {30--36},
   Year = {1995},
   ISSN = {0272-1732},
   Abstract = {New advanced architectures in DRAM interfaces seek to close the ever-widening performance gap between DRAM and microprocessor and to break the bandwidth bottleneck in graphics systems. We present an overview of five of these interfaces: EDO, SDRAM, RDRAM, CDRAM, and 3D-RAM. EDO will soon replace conventional DRAM, and SDRAM will partly take over in 66-MHz and higher frequency systems. Other interfaces will initially find target markets that exploit their unique features, and then seek wider market acceptance. Eventually, advances in DRAM will contribute to the trend toward a system on a chip.},
   Keywords = {DRAM chips, peripheral interfaces, system buses, 3D-RAM, CDRAM, DRAM interfaces, EDO, RDRAM, SDRAM, graphics systems, performance gap}
}



% The key was generated from the authors' given names; it is kept unchanged so
% existing citations still resolve. The names below are in "Surname, Given" order.
@inproceedings{3D:KNH+04,
   Author = {Lee, Kun-Bin and Chang, Nelson Yen-Chung and Chin, Hao-Yun and Hsu, Hui-Cheng and Jen, Chein-Wei},
   Title = {A bandwidth and memory efficient {MPEG-4} shape encoder},
   BookTitle = {Proceedings of the Asia and South Pacific Design Automation Conference (ASP-DAC 2004)},
   Pages = {525--526},
   Year = {2004},
   Abstract = {We have developed a shape encoder hardware for MPEG-4 video coding. On the one hand, the alpha component is compressed and therefore, the size and memory access of alpha frame memory can be reduced to 50\% and 56.25\% respectively. On the other hand, an efficient data transfer scheme combining the run length coding and addressing mode can reduce average data transfer time to 9.39\% and accelerate the shape encoding process. The shape encoder can support MPEG-4 main profile at Level 4 in real-time. In addition, verification and testing methods are also considered.},
   Keywords = {bandwidth compression, encoding, runlength codes, video coding, MPEG-4 main profile, MPEG-4 shape encoder hardware, MPEG-4 video coding, alpha frame memory, bandwidth efficient, data transfer scheme, memory efficient, run length coding}
}



@article{3D:LJB+99,
   Author = {Lea, R. M. and Jalowiecki, I. P. and Boughton, D. K. and Yamaguchi, J. S. and Pepe, A. A. and Ozguz, V. H. and Carson, J. C.},
   Title = {A {3-D} stacked chip packaging solution for miniaturized massively parallel processing},
   Journal = {IEEE Transactions on Advanced Packaging},
   Volume = {22},
   Number = {3},
   Pages = {424--432},
   Year = {1999},
   ISSN = {1521-3323},
   Abstract = {The development and evaluation of a three-dimensional (3-D) interconnect and packaging technology for massively parallel processor (MPP) implementation is reported. Following reviews of specific modular massively parallel computer (MPC) accelerator and chip stacking technologies, the paper reports the progress of a collaborative research project to pioneer a novel MPP module. The design of a highly compact 3-D chip-stack, integrating five MPP chips in a single package, is described in detail. Problems encountered and their solutions are reported. Test results for prototype MPP chip-stacks provide proof-of-principle for the 3-D chip stacking approach. Allowing from 2:1 to 4:1 savings in the modular MPC implementation size, without significant increase in cost or loss of performance, the emerging MPP chip stacking technology offers a cost-effective solution for MPP miniaturization.},
   Keywords = {VLSI, associative processing, chip scale packaging, fault tolerant computing, integrated circuit interconnections, multichip modules, parallel architectures, reconfigurable architectures, 3-D interconnect, 3-D stacked chip packaging solution, MPP module, SIMD MPP, associative string processing, cost-effective solution, defect/fault tolerance, high density interconnect, highly compact 3-D chip-stack, miniaturized massively parallel processing, modular MPC implementation size, reconfigurable data parallel computing}
}



@article{3D:LMV+98,
   Author = {Leeser, M. and Meleis, W. M. and Vai, M. M. and Chiricescu, S. and Xu, Weidong and Zavracky, P. M.},
   Title = {{Rothko}: a three-dimensional {FPGA}},
   Journal = {IEEE Design and Test of Computers},
   Volume = {15},
   Number = {1},
   Pages = {16--23},
   Year = {1998},
   ISSN = {0740-7475},
   Abstract = {Using transferred circuits and metal interconnections placed between layers of active devices anywhere on the chip, Rothko aims at solving utilization, routing, and delay problems of existing FPGA architectures. Experimental implementations have demonstrated important performance advantages.},
   Keywords = {circuit layout CAD, field programmable gate arrays, logic CAD, FPGA, Rothko, active devices, delay problems, metal interconnections, routing, transferred circuits, utilization}
}



% The key was generated from the authors' given names (Lei, Hong-Seung); it is
% kept unchanged so existing citations still resolve.
@article{3D:LLH+03,
   Author = {Xue, Lei and Liu, C. C. and Kim, Hong-Seung and Kim, S. K. and Tiwari, S.},
   Title = {Three-dimensional integration: technology, use, and issues for mixed-signal applications},
   Journal = {IEEE Transactions on Electron Devices},
   Volume = {50},
   Number = {3},
   Pages = {601--609},
   Year = {2003},
   ISSN = {0018-9383},
   Abstract = {Three-dimensional (3-D) integration provides opportunities in large-scale integration of mixed-signal and general system-on-chip applications with improved performance, through increased density and mixing of different active and passive technologies. This paper reports a novel low-thermal-budget 3-D fabrication technique---multilayers with buried structures (MLBS)---and an analysis of its applicability to mixed-signal integration. The MLBS technique uses a low temperature of 450~$^{\circ}$C to transfer a single-crystal silicon layer over a processed wafer consisting of buried in-plane and out-of-plane interconnects obtained through a dual Damascene process. Devices can continue to be processed on this transferred layer. Electrical characteristics of MOS capacitors ($D_{it} = 4.7 \times 10^{10}$~cm$^{-2}$\,eV$^{-1}$) and 3-D integrated planar CMOS transistors (3-D CMOS), fabricated using MLBS, are consistent with integration requirements. Our analog analysis includes an investigation of thermal effects important to analog applications with continuous operation of transistors in forward active bias, as well as of the coupling isolation derived from use of a ground-plane. Use of high density local interconnectivity improves the thermal properties of 3-D CMOS over that of silicon-on-insulator, and use of a ground plane is shown to lead to an improvement of better than 8 dB in coupling isolation.},
   Keywords = {CMOS integrated circuits, VLSI, integrated circuit interconnections, integrated circuit technology, large scale integration, mixed analogue-digital integrated circuits, silicon, system-on-chip, 3D integrated planar CMOSFETs, 3D integration technology, 450 C, CMOS ICs, MLBS technique, MOS capacitors, Si, SoC applications, analog applications, buried in-plane interconnects, coupling isolation, crosstalk, dual Damascene process, forward active bias operation, ground-plane, high density local interconnectivity, large-scale integration, low-thermal-budget 3D fabrication technique, mixed-signal applications, multilayers with buried structures, out-of-plane interconnects, planar CMOS transistors, processed wafer, self-heating, single-crystal Si layer transfer, system-on-chip applications, thermal effects, three-dimensional integration}
}



@inproceedings{3D:LLT01,
   Author = {Xue, Lei and Liu, C. C. and Tiwari, S.},
   Title = {Multi-layers with buried structures ({MLBS}): an approach to three-dimensional integration},
   BookTitle = {IEEE International SOI Conference},
   Pages = {117--118},
   Year = {2001},
   Abstract = {A new multi-layers with buried structures (MLBS) approach that is suitable for three-dimensional integration is described. The silicon layering technique uses temperatures as low as 450 C, comparable to some of the steps in back-end-of-line processing of CMOS and provides a solution to temperature constraints in integration.},
   Keywords = {CMOS integrated circuits, buried layers, multilayers, silicon-on-insulator, 3D integration, 450 C, CMOS, MLBS, SOI, Si, back-end-of-line processing, layering technique, multilayers with buried structures approach, temperature constraints}
}



@article{3D:LIH80,
   Author = {Li, H. F.},
   Title = {Bandwidth of fast memory in multiprocessing},
   Journal = {Proceedings of the IEEE},
   Volume = {68},
   Number = {5},
   Pages = {630--632},
   Year = {1980},
   ISSN = {0018-9219},
   Abstract = {A Markov chain analysis is presented for evaluating the average memory bandwidth of a multiple microprocessor system where the random access memory modules have a cycle time shorter than the processor cycle time. The results are useful in the design of such systems, as well as in the understanding of the effects of such processor-memory speed mismatches.}
}



@article{3d-Lim,
   Author = {Lim, S. K.},
   Title = {Physical Design for {3D} System on Package},
   Journal = {IEEE Design and Test of Computers},
   Volume = {22},
   Number = {6},
   Pages = {532--539},
   Year = {2005},
   Abstract = {The SoC paradigm is a system integration approach that integrates large numbers of transistors as well as various mixed-signal active and passive components onto a single chip. This realization led to the 3D system-in-package (SiP) approach, alternatively called 3D ICs or 3D stacked die/package. Designers can take SiP a step further by embedding both active and passive components, but passive-component embedding is bulky and requires thick-film discrete components. Thick-film component embedding distinguishes SiP from system on package (SoP), an emerging 3D system integration concept that involves embedding both active and passive components. SoP, however, incorporates ultrathin films at microscale to embed the passive components, and the package rather than the board is the system. SoP overcomes both the computing and integration limitations of SoC, SiP, multichip modules (MCMs), and traditional system packaging by having global wiring as well as RF, digital, and optical component integration in the package instead of on the chip. Moreover, 3D SoP addresses the wire delay problem by enabling the replacement of long, slow global interconnects with short, fast vertical routes.}
}



% Second record of the paper also stored under key 3D:LV06. It previously had no
% citation key, so BibTeX stopped with a parse error on this entry. The key below
% is new; prefer citing 3D:LV06. NOTE(review): confirm the new key is unused
% elsewhere in this file, or merge the two records.
@inproceedings{3d-LV06,
   Author = {Link, G. M. and Vijaykrishnan, N.},
   Title = {Thermal Trends in Emerging Technologies},
   BookTitle = {International Symposium on Quality Electronic Design},
   Pages = {625--632},
   Year = {2006},
   Abstract = {In the future, the peak temperature of a chip will be a primary design constraint. In order to meet this constraint, temperature must be considered in the earliest phases of the design process. Using a newly developed thermal analysis tool, HS3d, this work explores the thermal profile of devices as technology varies. We show that as technology scales, the hotspot locations can shift from the units with the most switching activity to those with the most low-threshold transistors. We further note that process variations in leakage dominated technologies can result in significant variations in the hotspot locations, indicating that feedback from thermal sensors will be very important. Finally, this work examines the thermal effects of multi-layer device stacking technologies, and finds that the vertical temperature difference between layers is much less significant than the horizontal differences due to power density, and as such, vertical placement optimizations will have much smaller impact on hotspot development than a uniform power distribution.}
}



% Page range replaces the former "8 pp." placeholder; it is taken from the
% duplicate record of this paper that sits immediately before this entry.
@inproceedings{3D:LV06,
   Author = {Link, G. M. and Vijaykrishnan, N.},
   Title = {Thermal trends in emerging technologies},
   BookTitle = {International Symposium on Quality Electronic Design},
   Pages = {625--632},
   Year = {2006},
   Abstract = {In the future, the peak temperature of a chip will be a primary design constraint. In order to meet this constraint, temperature must be considered in the earliest phases of the design process. Using a newly developed thermal analysis tool, HS3d, this work explores the thermal profile of devices as technology varies. We show that as technology scales, the hotspot locations can shift from the units with the most switching activity to those with the most low-threshold transistors. We further note that process variations in leakage dominated technologies can result in significant variations in the hotspot locations, indicating that feedback from thermal sensors will be very important. Finally, this work examines the thermal effects of multi-layer device stacking technologies, and finds that the vertical temperature difference between layers is much less significant than the horizontal differences due to power density, and as such, vertical placement optimizations will have much smaller impact on hotspot development than a uniform power distribution.},
   Keywords = {integrated circuit design, thermal analysis, thermal management (packaging), HS3d, chip peak temperature, design constraint, emerging technologies, horizontal differences, hotspot development, hotspot locations, low-threshold transistors, multilayer device stacking technologies, power density, power distribution, thermal analysis tool, thermal sensors, thermal trends, vertical placement optimizations, vertical temperature difference}
}



@article{3d-LGB+05,
   Author = {Liu, C. C. and Ganusov, I. and Burtscher, M. and Tiwari, Sandip},
   Title = {Bridging the Processor-memory Performance Gap with {3D} {IC} Technology},
   Journal = {IEEE Design and Test of Computers},
   Volume = {22},
   Number = {6},
   Pages = {556--564},
   Year = {2005},
   Abstract = {Microprocessor performance has been improving at roughly 60\% per year. Memory access times, however, have improved by less than 10\% per year. The resulting gap between logic and memory performance has forced microprocessor designs toward complex and power-hungry architectures that support out-of-order and speculative execution. Moreover, processors have been designed with increasingly large cache hierarchies to hide main memory latency. This article examines how 3D IC technology can improve interactions between the processor and memory. Our work examines the performance of a single-core, single-threaded processor under representative work loads. We have shown that reducing memory latency by bringing main memory on chip gives us near-perfect performance. Three-dimensional IC technology can provide the much needed bandwidth without the cost, design complexity, and power issues associated with a large number of off-chip pins. The principal challenge remains the demonstration of a highly manufacturable 3D IC technology with high yield and low cost.}
}

% The key keeps its historical spelling so existing citations still resolve;
% the author's surname is spelled "Krewell".
% NOTE(review): volume, issue and month are not recorded; add them if available.
@article{alpha:krewel,
   Author = {Krewell, K.},
   Title = {{Alpha} {EV7} Processor: A High-Performance Tradition Continues},
   Journal = {Microprocessor Report},
   Year = {2005}
}

@article{3D:MS94,
   Author = {Mahmud, S. M. and Samaratunga, L. T.},
   Title = {Memory bandwidth analysis of hierarchical multiprocessors using model decomposition and steady-state flow analysis},
   Journal = {IEEE Transactions on Parallel and Distributed Systems},
   Volume = {5},
   Number = {5},
   Pages = {553--560},
   Year = {1994},
   ISSN = {1045-9219},
   Abstract = {For memory bandwidth analysis, researchers generally discard requests that are not accepted during a memory cycle. This assumption simplifies the analysis and produces negligible discrepancies with actual results for a system with a non-hierarchical interconnection network. However, the assumption, ``the requests that are not occupied during a memory cycle are discarded,'' cannot be used for a multiprocessor system with a hierarchical interconnection network (HIN), because the error introduced by this assumption can be several orders of magnitude higher than the actual bandwidth. An improved analytical model to determine the bandwidth of a HIN-based system is presented.},
   Keywords = {failure analysis, memory architecture, multiprocessor interconnection networks, performance evaluation, shared memory systems, hierarchical interconnection network, hierarchical multiprocessors, memory bandwidth analysis, memory cycle, model decomposition, steady-state flow analysis}
}



% The key was generated from the author's given name (Mansun); it is kept
% unchanged so existing citations still resolve.
@inproceedings{3D:MAN01,
   Author = {Chan, Mansun},
   Title = {The potential and realization of multi-layers three-dimensional integrated circuit},
   BookTitle = {Proceedings of the 6th International Conference on Solid-State and Integrated-Circuit Technology},
   Volume = {1},
   Pages = {40--45},
   Year = {2001},
   Abstract = {3D IC technology with transistors stacked on top of one another in multiple silicon layers has long been a vision for future technology directions to provide a breakthrough towards higher circuit density and functionality. While the idea is simple, techniques to obtain high performance multi-layer transistors are extraordinarily difficult, and it is only recently that such technology has become feasible. In this paper, various techniques to form 3D circuits are reviewed. Recent development of a promising technology to achieve three-dimensional integration using metal-induced-lateral-crystallization is described. Preliminary results for 3D inverters and ring-oscillators have demonstrated the viability of 3D integrated circuits. Finally, the challenges for realization of 3D circuits are discussed.},
   Keywords = {CMOS integrated circuits, crystallisation, elemental semiconductors, integrated circuit metallisation, integrated circuit testing, logic gates, oscillators, silicon, 3D IC technology, 3D circuit techniques, 3D circuit technology, 3D integrated circuits, 3D integration technology, 3D inverters, 3D ring-oscillators, Si, circuit density, circuit functionality, metal-induced-lateral-crystallization, multi-layer transistors, multilayer 3D integrated circuits, multiple silicon layers, recrystallization, stacked CMOS structures, stacked transistors, thin-film-transistors, wafer bonding}
}



@inproceedings{3D:MCG04,
   Author = {Marchal, P. and Catthoor, F. and Gomez, J. I.},
   Title = {Optimizing the memory bandwidth with loop fusion},
   BookTitle = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS 2004)},
   Pages = {188--193},
   Year = {2004},
   Abstract = {The memory bandwidth largely determines the performance and energy cost of embedded systems. At the compiler level, several techniques improve the memory bandwidth at the scope of a basic block, but often fail to exploit all. We propose a technique to optimize the memory bandwidth across the boundaries of a basic block. Our technique incrementally fuses loops to better use the available bandwidth. The resulting performance depends on how the data is assigned to the memories of the memory layer. At the same time, the assignment also strongly influences the energy cost. Therefore, we combine in our approach the fusion and assignment decisions. Designers can use our output to trade-off the energy cost with the system's performance.},
   Keywords = {embedded systems, optimisation, optimising compilers, program control structures, storage management, assignment decisions, compiler memory bandwidth, energy cost, loop fusion, low power design, memory bandwidth optimization}
}



@article{3D:MHH+03,
   Author = {Margomenos, A. and Herrick, K. J. and Herman, M. I. and Valas, S. and Katehi, L. P. B.},
   Title = {Isolation in three-dimensional integrated circuits},
   Journal = {IEEE Transactions on Microwave Theory and Techniques},
   Volume = {51},
   Number = {1},
   Pages = {25--32},
   ISSN = {0018-9480},
   Abstract = {The necessity for high-density integrated-circuit (IC) design makes the issue of circuit isolation a critical one. In multilayer structures, surface waves excited by planar discontinuities induce parasitic currents on adjoining interconnects, therefore, causing a parasitic coupling, which becomes a limiting factor as density increases and size reduces. This paper presents a study of those proximity effects on interconnect geometry for X- and W-bands. Various configurations of finite ground microstrip lines, finite ground coplanar (FGC) waveguides, and transitions have been analyzed for silicon and Duroid substrates. The results included in this paper illustrate the implications of parasitic coupling associated with interconnects printed in parallel or perpendicular configurations, lines located in close proximity to open-end discontinuities, vias etched near FGC lines, and finally vertical transitions through wafers. Theoretical and experimental results, in terms of reduced isolation, are provided, showing the advantages of the use of FGC lines over conventional microstrip lines since, in the cases investigated, they offer 8 dB higher isolation. Additionally, open-end effects are demonstrated to increase coupling by as much as 6 dB, while vertical transitions through wafers cause a parasitic coupling in the order of 3 dB. The results presented in this study can be employed in order to reduce parasitic interference between interconnects.},
   Keywords = {coplanar waveguides, integrated circuit interconnections, integrated circuit packaging, isolation technology, microstrip lines, microwave integrated circuits, millimetre wave integrated circuits, silicon, 3D ICs, Duroid substrates, Si, Si substrates, W-band, X-band, adjoining interconnects, circuit isolation, finite ground CPW, finite ground coplanar waveguides, finite ground microstrip lines, high-density IC design, interconnect geometry, multilayer structures, open-end discontinuities, open-end effects, parallel configurations, parasitic coupling, parasitic currents, parasitic interference reduction, perpendicular configurations, planar discontinuities, proximity effects, surface waves, three-dimensional integrated circuits, transitions, vertical transitions, vias},
   Year = {2003}
}



@inproceedings{3D:MGC+03,
   Author = {Ieong, Meikei and Guarini, K. W. and Chan, V. and Bernstein, K. and Joshi, R. and Kedzierski, J. and Haensch, W.},
   Title = {Three dimensional {CMOS} devices and integrated circuits},
   BookTitle = {Proceedings of the IEEE Custom Integrated Circuits Conference (CICC)},
   Pages = {207--213},
   Abstract = {Three dimensional devices and integrated circuits are attractive options for overcoming barriers in device and interconnect scaling, offering an opportunity to continue the CMOS performance trend. This paper reviews the process technology and associated design issues in three dimensional devices and integrated circuits.},
   Keywords = {CMOS integrated circuits, MOSFET, integrated circuit design, integrated circuit interconnections, 3D CMOS IC, 3D CMOS devices, CMOS performance trend, CMOS process technology, MOSFET devices, interconnect scaling barriers},
   Year = {2003}
}



@inproceedings{3D:NFC+99,
   Author = {Nahman, A. and Fan, A. and Chung, J. and Reif, R.},
   Title = {Wire-length distribution of three-dimensional integrated circuits},
   BookTitle = {Proceedings of the IEEE International Interconnect Technology Conference},
   Pages = {233--235},
   Abstract = {In this paper, the wire (interconnect)-length distribution of three-dimensional (3D) integrated circuits is derived following the methodology used to estimate two-dimensional wire-length distribution (Davis et al., 1998). It is found that 3D integration results in a narrower wire-length distribution with a higher number of local wires and fewer global wires than 2D integration. The impact of 3D integration on system performance is discussed.},
   Keywords = {VLSI, circuit CAD, circuit simulation, integrated circuit design, integrated circuit interconnections, integrated circuit metallisation, integrated circuit modelling, 2D integration, 2D wire-length distribution estimation methodology, 3D ICs, 3D integrated circuits, 3D integration, global wires, interconnect wire-length distribution, local wires, system performance, wire-length distribution},
   Year = {1999}
}



@article{3D:NEU90,
   Author = {Neudeck, G. W.},
   Title = {Three-dimensional {CMOS} integration},
   Journal = {IEEE Circuits and Devices Magazine},
   Volume = {6},
   Number = {5},
   Pages = {32--38},
   ISSN = {8755-3996},
   Abstract = {The advantages of CMOS technology are examined, and problems of and approaches to 3-D integration are discussed. Particular attention is given to silicon-on-insulator (SOI) technology and the use of selective epitaxial growth (SEG) and epitaxial lateral overgrowth (ELO) of monocrystalline silicon. The fabrication of 3-D CMOS devices using these techniques is described.},
   Keywords = {CMOS integrated circuits, integrated circuit technology, semiconductor epitaxial layers, semiconductor growth, semiconductor-insulator boundaries, 3-D integration, CMOS technology, SOI, epitaxial lateral overgrowth, selective epitaxial growth, silicon-on-insulator},
   Year = {1990}
}



@article{3D:RDC+03,
   Author = {Rahman, A. and Das, S. and Chandrakasan, A. P. and Reif, R.},
   Title = {Wiring requirement and three-dimensional integration technology for field programmable gate arrays},
   Journal = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
   Volume = {11},
   Number = {1},
   Pages = {44--54},
   ISSN = {1063-8210},
   Abstract = {In this paper, analytical models for predicting interconnect requirements in field-programmable gate arrays (FPGAs) are presented, and opportunities for three-dimensional (3-D) implementation of FPGAs are examined. The analytical models for two-dimensional FPGAs are calibrated by routing and placement experiments with benchmark circuits and extended to 3-D FPGAs. Based on system-level modeling, we find that in FPGAs with more than 20 K four-input look-up tables, the reduction in channel width, interconnect delay and power dissipation can be over 50\% by 3-D implementation.},
   Keywords = {VLSI, delay estimation, field programmable gate arrays, integrated circuit interconnections, integrated circuit layout, integrated circuit modelling, integrated circuit technology, logic design, network routing, stochastic processes, 3D FPGAs, 3D ICs, 3D integrated circuit, 3D integration technology, Rent's rule, analytical models, channel width reduction, field-programmable gate arrays, four-input look-up tables, interconnect delay, interconnect delay reduction, interconnect requirements, placement experiments, power dissipation reduction, routability prediction, routing experiments, stochastic models, system-level modeling, three-dimensional integration technology, wiring requirement},
   Year = {2003}
}



@inproceedings{3D:RFR00,
   Author = {Rahman, A. and Fan, A. and Reif, R.},
   Title = {Comparison of key performance metrics in two- and three-dimensional integrated circuits},
   BookTitle = {Proceedings of the IEEE International Interconnect Technology Conference},
   Pages = {18--20},
   Abstract = {In this paper some key performance metrics in two-dimensional (2-D) and three-dimensional (3-D) integrated circuits (IC) are estimated for scaled technologies from 250-nm to 50-nm technology nodes using a system-level modeling approach. Considering a microprocessor as an example, projections are made for performance metrics such as clock frequency, chip area, interconnect delay and repeater's number for 2-D and 3-D implementation.},
   Keywords = {integrated circuit modelling, microprocessor chips, 50 to 250 nm, chip area, clock frequency, interconnect delay, microprocessor, performance metric, repeater number, system-level model, three-dimensional integrated circuit, two-dimensional integrated circuit},
   Year = {2000}
}



@article{3D:RR00,
   Author = {Rahman, A. and Reif, R.},
   Title = {System-level performance evaluation of three-dimensional integrated circuits},
   Journal = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
   Volume = {8},
   Number = {6},
   Pages = {671--678},
   ISSN = {1063-8210},
   Abstract = {In this paper, the wire (interconnect)-length distribution of three-dimensional (3-D) integrated circuits (ICs) is derived using Rent's rule and following the methodology used to estimate two-dimensional (2-D) (wire-length distribution). Two limiting cases of connectivity between logic gates on different device layers are examined by comparing the wire-length distribution and average and total wire-length. System performance metrics such as clock frequency, chip area, etc., are estimated using wire-length distribution, interconnect delay criteria, and simple models representing the cost or complexity for manufacturing 3-D ICs. The technology requirement for interconnects in 3-D integration is also discussed.},
   Keywords = {VLSI, circuit layout CAD, delays, integrated circuit interconnections, integrated circuit layout, integrated circuit modelling, wiring, chip area, clock frequency, connectivity, device layers, interconnect delay criteria, interconnect-length distribution, system-level performance evaluation, three-dimensional integrated circuits, total wire-length, wire-length distribution},
   Year = {2000}
}



@inproceedings{3D:RR01,
   Author = {Rahman, A. and Reif, R.},
   Title = {Thermal analysis of three-dimensional ({3-D}) integrated circuits ({ICs})},
   BookTitle = {Proceedings of the IEEE International Interconnect Technology Conference},
   Pages = {157--159},
   Abstract = {In this paper, we examine the thermal issues in 3-D ICs by system-level modeling of power dissipation and analytical and numerical modeling of device- and package-level heat removal. We find that for comparable system performance in 2-D and 3-D ICs, 20\%--25\% reduction in power dissipation can be achieved by 3-D integration due to lower capacitance associated with interconnects and clock networks. If the system performance in 3-D ICs is higher (compared to that of 2-D ICs), chip temperature could reach an unacceptable level. The chip temperature is generally limited by the heat removal capability of the packaging technology. To reduce the chip temperature in 3-D ICs for reliable operation of devices and interconnects, innovative package-level cooling technologies will be necessary. Thermal vias, Cu bonding layer for 3-D integration, etc. could also be beneficial for heat removal in 3-D ICs.},
   Keywords = {capacitance, cooling, integrated circuit interconnections, integrated circuit modelling, integrated circuit packaging, thermal analysis, 3-D ICs, Cu, Cu bonding layer, analytical modeling, chip temperature, clock networks, device-level heat removal, heat removal capability, interconnects, numerical modeling, package-level cooling technologies, package-level heat removal, packaging technology, power dissipation, system-level modeling, thermal vias, three-dimensional integrated circuits},
   Year = {2001}
}



@inproceedings{3D:RFK+02,
   Author = {Reif, R. and Fan, A. and Chen, K. and Das, S.},
   Title = {Fabrication technologies for three-dimensional integrated circuits},
   BookTitle = {International Symposium on Quality Electronic Design},
   Pages = {33--37},
   Abstract = {The MIT approach to 3D VLSI integration is based on low-temperature Cu-Cu wafer bonding. Device wafers are bonded in a face-to-back manner, with short vertical vias and Cu-Cu pads as the inter-wafer throughway. In our scheme, there are several reliability criteria, which include: (a) structural integrity of the Cu-Cu bond; (b) Cu-Cu contact electrical characteristics; and (c) process flow efficiency and repeatability. In addition, CAD tools are needed to aid in design and layout of 3DICs. This paper discusses recent results in all these areas.},
   Keywords = {VLSI, circuit CAD, integrated circuit interconnections, integrated circuit manufacture, integrated circuit metallisation, integrated circuit reliability, interface structure, wafer bonding, 3D VLSI, 3D integrated circuits, 3DIC design, 3DIC layout, CAD tools, Cu, Cu-Cu bond structural integrity, Cu-Cu contact electrical characteristics, Cu-Cu pads, fabrication technologies, face-to-back bonded device wafers, inter-wafer throughway, low-temperature Cu-Cu wafer bonding, process flow efficiency, process flow repeatability, reliability criteria, short vertical vias},
   Year = {2002}
}



@article{3D:RES93,
   Author = {Rose, J. and El Gamal, A. and Sangiovanni-Vincentelli, A.},
   Title = {Architecture of field-programmable gate arrays},
   Journal = {Proceedings of the IEEE},
   Volume = {81},
   Number = {7},
   Pages = {1013--1029},
   ISSN = {0018-9219},
   Abstract = {A survey of field-programmable gate array (FPGA) architectures and the programming technologies used to customize them is presented. Programming technologies are compared on the basis of their volatility, size, parasitic capacitance, resistance, and process technology complexity. FPGA architectures are divided into two constituents: logic block architectures and routing architectures. A classification of logic blocks based on their granularity is proposed, and several logic blocks used in commercially available FPGAs are described. A brief review of recent results on the effect of logic block granularity on logic density and performance of an FPGA is then presented. Several commercial routing architectures are described in the context of a general routing architecture model. Finally, recent results on the tradeoff between the flexibility of an FPGA routing architecture, its routability, and its density are reviewed.},
   Keywords = {PLD programming, application specific integrated circuits, logic arrays, network routing, FPGA architectures, field-programmable gate arrays, granularity, logic block architectures, logic density, parasitic capacitance, process technology complexity, programming technologies, resistance, routing architectures},
   Year = {1993}
}



@article{3D:SAK97,
   Author = {Sakamura, K.},
   Title = {Guest Editor's Introduction: Advanced {DRAM} Technology},
   Journal = {IEEE Micro},
   Volume = {17},
   Number = {6},
   Pages = {8--9},
   ISSN = {0272-1732},
   Year = {1997}
}



@article{3D:SSY97,
   Author = {Sase, I. and Shimizu, N. and Yoshikawa, T.},
   Title = {Multimedia {LSI} accelerator with embedded {DRAM}},
   Journal = {IEEE Micro},
   Volume = {17},
   Number = {6},
   Pages = {49--54},
   ISSN = {0272-1732},
   Abstract = {To succeed in the graphics controllers market, it is important to take advantage of embedded DRAMs, which provide low power consumption, low electromagnetic interference (EMI), smaller board space, and frame memory flexibility (capacity, access speed, and bandwidth). These capabilities benefit portable PC applications in which board space and power consumption are serious considerations. The MSM7680 accelerator is ideally suited for a compact multimedia system because of its smaller, embedded DRAM capacity. The MSM7680 provides high performance and/or a one-chip solution for many graphics display systems. It also integrates the frame buffer with graphics controller functions such as a 2D drawing engine, MPEG-1 decoder, digital/analog converter for RGB analog output, and a clock generator phase-locked loop. The MSM7680 multimedia accelerator uses a 256-bit data bus with embedded 1.25-Mbyte DRAM to increase the rate of data transfers and decrease power consumption.},
   Keywords = {DRAM chips, coprocessors, large scale integration, microcontrollers, multimedia computing, 2D drawing engine, MPEG-1 decoder, MSM7680 accelerator, clock generator phase-locked loop, digital/analog converter, electromagnetic interference, embedded DRAM, frame memory flexibility, graphics controllers, multimedia LSI accelerator},
   Year = {1997}
}



@inproceedings{3D:SRE+04,
   Author = {Shiu, P. H. and Ravichandran, R. and Easwar, S. and Lim, S. K.},
   Title = {Multi-layer floorplanning for reliable system-on-package},
   BookTitle = {Proceedings of the IEEE International Symposium on Circuits and Systems (ISCAS)},
   Volume = {5},
   Pages = {69--72},
   Abstract = {Physical design automation for the new emerging mixed-signal system-on-package (SOP) technology requires a new kind of floorplanner - it must place both active components such as digital IC, analog ICs, memory modules, MEMS, and optoelectronic modules, and embedded passive components such as capacitors, resistors and inductors in a multi-layer packaging substrate while considering various signal integrity issues. We propose a new interconnect-centric multi-layer floorplanner named MF-SOP, which is based on a multiple objective stochastic simulated annealing method. The contribution of this work is to first formulate this new kind of floorplanning problem and then to develop an effective algorithm that handles various design constraints unique to SOP. The related experiments show that the area reduction of MF-SOP compared to its 2D counterpart is on the order of $O(k)$ and wirelength reduction is 39\% average for k-layer SOP, while satisfying design constraints.},
   Keywords = {circuit layout CAD, integrated circuit layout, integrated circuit packaging, integrated circuit reliability, mixed analogue-digital integrated circuits, multilayers, simulated annealing, system-on-chip, MEMS, MF-SOP, active component placement, analog IC, area reduction, capacitors, digital IC, embedded passive components, inductors, interconnect-centric floorplanner, k-layer SOP, memory modules, mixed-signal system-on-package, multilayer floorplanner, multilayer floorplanning, multilayer packaging substrate, optoelectronic modules, physical design automation, resistors, signal integrity, stochastic simulated annealing, wirelength reduction},
   Year = {2004}
}



@techreport{3D:CACTI,
   Author = {Shivakumar, P. and Jouppi, N. P.},
   Title = {{CACTI} 3.0: An integrated cache timing, power, and area model},
   Institution = {Compaq Western Research Laboratory},
   Type = {Research Report},
   Number = {2001/2},
   Year = {2001}
}



@inproceedings{3D:SSH+03,
   Author = {Skadron, K. and Stan, M. R. and Huang, W. and Velusamy, Sivakumar and Sankaranarayanan, Karthik and Tarjan, D.},
   Title = {Temperature-aware microarchitecture},
   BookTitle = {International Symposium on Computer Architecture},
   Pages = {2--13},
   Abstract = {With power density and hence cooling costs rising exponentially, processor packaging can no longer be designed for the worst case, and there is an urgent need for runtime processor-level techniques that can regulate operating temperature when the package's capacity is exceeded. Evaluating such techniques, however, requires a thermal model that is practical for architectural studies. We describe HotSpot, an accurate yet fast model based on an equivalent circuit of thermal resistances and capacitances that correspond to microarchitecture blocks and essential aspects of the thermal package. Validation was performed using finite-element simulation. We also introduce several effective methods for dynamic thermal management (DTM): ``temperature-tracking'' frequency scaling, localized toggling, and migrating computation to spare hardware units. Modeling temperature at the microarchitecture level also shows that power metrics are poor predictors of temperature, and that sensor imprecision has a substantial impact on the performance of DTM.},
   Keywords = {cooling, finite element analysis, heat sinks, integrated circuit packaging, microprocessor chips, power engineering computing, temperature measurement, temperature sensors, thermal management (packaging), dynamic thermal management, finite element simulation, frequency scaling, hotspot thermal model, localized toggling, power-performance simulator, real-time temperature sensor, spare hardware unit computation, temperature tracking, temperature-aware microarchitecture},
   Year = {2003}
}



@inproceedings{3D:SJW+04,
   Author = {Jung, Soon-Moon and Jang, Jaehoon and Cho, Wonseok and Moon, Jaehwan and Kwak, Kunho and Choi, Bonghyun and Hwang, Byungjun and Lim, Hoon and Jeong, Jaehun and Kim, Jonghyuk and Kim, Kinam},
   Title = {The revolutionary and truly 3-dimensional {25F$^{2}$} {SRAM} technology with the smallest {S$^{3}$} (stacked single-crystal {Si}) cell, {0.16 $\mu$m$^{2}$}, and {SSTFT} (stacked single-crystal thin film transistor) for ultra high density {SRAM}},
   BookTitle = {Symposium on VLSI Technology},
   Pages = {228--229},
   Abstract = {The smallest 25F$^{2}$ SRAM cell size of 0.16 $\mu$m$^{2}$ is realized by S$^{3}$ cell technology and SSTFT with 193nm ArF lithography process. The stacked single-crystal thin film is developed and used for the first time in the SRAM cell to make the SRAM products comparative to the DRAM products in the density and the cost. The load PMOS and pass NMOS transistors are stacked over the planar pull-down NMOS transistors to drastically reduce the cell size. In this study, the dream of truly 3D memory device is achieved by fabricating 64M bit density SRAM.},
   Keywords = {MOSFET, SRAM chips, elemental semiconductors, silicon, thin film transistors, 3D memory device, SRAM technology, stacked single-crystal Si, stacked single-crystal thin film transistor, ultra high density SRAM},
   Year = {2004}
}



@inproceedings{3D:SBK+00,
   Author = {Souri, S. J. and Banerjee, K. and Mehrotra, A. and Saraswat, K. C.},
   Title = {Multiple {Si} layer {ICs}: motivation, performance analysis, and design implications},
   BookTitle = {Proceedings of the Design Automation Conference (DAC)},
   Pages = {213--220},
   Year = {2000}
}



@inproceedings{3D:TTT+04,
   Author = {Takahashi, K. and Taguchi, Y. and Tomisaka, M. and Yonemura, H. and Hoshino, M. and Ueno, M. and Egawa, Y. and Nemoto, Y. and Yamaji, Y. and Terao, H. and Umemoto, M. and Kameyama, K. and Suzuki, A. and Okayama, Y. and Yonezawa, T. and Kondo, K.},
   Title = {Process integration of {3D} chip stack with vertical interconnection},
   BookTitle = {Proceedings of the Electronic Components and Technology Conference (ECTC)},
   Volume = {1},
   Pages = {601--609},
   Abstract = {We succeeded in developing high-speed electrodeposition and high-rate CMP processes that greatly reduced the cost of Cu through-via fabrication used for three-dimensional (3D) chip stacking. Thin-wafer-handling processes were integrated with the development of wafer bonding and debonding equipment and processes. The investigation of thermal characteristics revealed the important structural guidelines for heat dissipation. Finally, the difficult challenges of 3D chip stacking, cost issues, wafer-handling issues and thermal issues, as well as fine pitch interconnection and electrical performance evaluation, have been established. Part of the achievements were applied to practical use in a commercial application.},
   Keywords = {chemical mechanical polishing, cooling, copper, fine-pitch technology, integrated circuit interconnections, integrated circuit packaging, materials handling, multichip modules, thermal management (packaging), 3D chip stack process integration, 3D chip stacking, Cu, Cu through-via fabrication, cost issues, electrical performance evaluation, fine pitch interconnection, high-rate CMP processes, high-speed electrodeposition processes, structural heat dissipation guidelines, thermal characteristics, thin-wafer-handling, vertical interconnection, wafer bonding, wafer bonding equipment, wafer debonding, wafer-handling issues},
   Year = {2004}
}



@inproceedings{3D:CSC+01,
   Author = {Chiang, Ting-Yen and Souri, S. J. and Chui, Chi On and Saraswat, K. C.},
   Title = {Thermal analysis of heterogeneous {3D} {ICs} with various integration scenarios},
   BookTitle = {International Electron Devices Meeting (IEDM) Technical Digest},
   Pages = {31.2.1--31.2.4},
   Abstract = {Presents detailed thermal analysis of high performance three dimensional (3D) ICs under various integration schemes. The model incorporates the effect of vias and power consumption due to both devices in active layers and interconnect joule heating. The results show excellent agreement with the 3D finite element simulations using ANSYS. It is shown that under certain scenarios, 3D ICs can actually lead to better thermal performance than planar (2D) ICs. With the effect of vias, as efficient heat dissipation paths, taken into account, our model provides more realistic temperature rise estimation for 3D ICs. Furthermore, tradeoffs among power, performance, chip real estate and thermal impact for 3D ICs is evaluated. Finally, the thermal influence from incorporating RF circuits and optical interconnect on 3D ICs has been discussed.},
   Keywords = {finite element analysis, integrated circuit design, integrated circuit interconnections, integrated circuit modelling, low-power electronics, optical interconnections, thermal analysis, ANSYS, RF circuits, active layers, chip real estate, finite element simulations, heat dissipation paths, heterogeneous 3D ICs, interconnect joule heating, optical interconnect, power consumption, temperature rise estimation, vias},
   Year = {2001}
}



@inproceedings{3D:TKK+02,
   Author = {Tiwari, S. and Kim, H. S. and Kim, S. and Kumar, A. and Liu, C. C. and Xue, L.},
   Title = {Three-dimensional integration in silicon electronics},
   BookTitle = {Proceedings of the IEEE Lester Eastman Conference on High Performance Devices},
   Pages = {24--33},
   Abstract = {As silicon electronics reaches length scales of 100 to 10 nm, device densities of $10^{9}$ to $10^{11}$ cm$^{-2}$, interconnect densities of $10^{10}$ to $10^{12}$ cm$^{-2}$, and applications across the spectrum of digital, analog, and mixed-signal domain, a number of key issues arise related to maintaining the improvement in performance, cost, power, and designability. Three-dimensional integration incorporating planar transistors offers interesting new directions for continuing improvements. Adaptive modifications of the planar transistors offer higher scalability and functionality, higher vertical interconnectivity in between device planes can reduce interconnect delays, higher programmability using configurable elements can provide efficient signal and energy flow, higher digital-analog isolation using ground-planes can provide cross-talk improvements for mixed-signal applications, and a power-aware design can allow control of temperature and power dissipation.},
   Keywords = {CMOS integrated circuits, SRAM chips, integrated circuit design, integrated circuit interconnections, integrated circuit manufacture, interference (signal), low-power electronics, mixed analogue-digital integrated circuits, programmable circuits, thermal management (packaging), 100 to 10 nm, CMOS, IC performance/cost/power/designability, SRAM, Si, Si electronics 3D integration, analog IC, configurable element programmability, cross-talk reduction, device density, device plane vertical interconnectivity, digital IC, digital-analog isolation, functionality, ground-planes, interconnect delays, interconnect density, mixed-signal IC, planar transistor adaptive modifications, power dissipation/temperature control, power-aware design, scalability, signal/energy flow, silicon electronics length scales},
   Year = {2002}
}



@inproceedings{3D:TFG+04,
   Author = {Topol, A. W. and Furman, B. K. and Guarini, K. W. and Shi, L. and Cohen, G. M. and Walker, G. F.},
   Title = {Enabling technologies for wafer-level bonding of {3D} {MEMS} and integrated circuit structures},
   BookTitle = {Proceedings of the Electronic Components and Technology Conference (ECTC)},
   Volume = {1},
   Pages = {931--938},
   Abstract = {In this paper, we describe several critical aspects of wafer scale or die level bonding to demonstrate: (1) low temperature bonding for planar layer interconnections; (2) low temperature bonding for non-planar layer sealing; (3) alignment and transfer of process sub-assemblies such as BEOL wiring, MEMS cavity or active device structures; and (4) integration methodology for fabrication of these layer stacks into 3D circuits and MEMS. We also show examples of how layer stacking protocols using wafer bonding technology provides a capability to integrate mixed materials and technologies potentially adaptable to many other applications. In addition, we demonstrate that in order to evaluate the influence of bonding on the electrical integrity of the transferred ICs, state-of-the-art circuits, such as short channel length MOSFETs or ring oscillators, should be tested as they are most sensitive to environmental/processing changes.},
   Keywords = {integrated circuit interconnections, microassembling, micromechanical devices, seals (stoppers), wafer bonding, 3D MEMS, 3D integrated circuit structures, BEOL wiring, MEMS cavity structures, active device structures, die level bonding, layer stacking protocols, layer stacks, low temperature bonding, nonplanar layer sealing, planar layer interconnections, process sub-assembly transfer, ring oscillators, short channel length MOSFET, sub-assembly wiring, wafer scale bonding, wafer-level bonding},
   Year = {2004}
}



@inproceedings{3D:WRT04,
   Author = {Wilkerson, P. and Raman, A. and Turowski, M.},
   Title = {Fast, automated thermal simulation of three-dimensional integrated circuits},
   BookTitle = {The Ninth Intersociety Conference on Thermal and Thermomechanical Phenomena in Electronic Systems (ITHERM '04)},
   Volume = {1},
   Pages = {706--713},
   Abstract = {Three-dimensional (3D) stacked integrated circuits (ICs) can significantly improve circuit performance and offer the promise of integrating various technologies (memory, logic, RF, mixed-signal, optoelectronics) within a single block. Lack of 3D design tools and heat dissipation from vertically stacked multiple layers are the crucial problems in their development. To address these issues, CFD Research Corporation (CFDRC) is developing methodologies and tools to analyze and assess coupled electrical and thermal performance of 3D ICs, including calculation of realistic full-chip thermal distributions and determining from them signal delay/distortion. Due to the stacking technology, extensive localized heating can occur. Analysis to minimize these hot spots using thermal vias is demonstrated. Our Python-script based framework allows to drive and control all the aspects of the 3D model building (directly from layouts), thermal simulations, and results extraction/post-processing. Hence, it is a good basis for coupling with Electronic Design Automation (EDA) systems. We present results of automated, fast, but detailed thermal simulations of 3D stacked integrated circuits. In addition, procedures for automatic extraction of reduced and compact thermal-resistance-based 3D models have been implemented. These techniques greatly reduce required computational time, and allow for very fast parametric modeling analysis of 3D IC design configurations and temperature extraction. From these thermal resistance models, equivalent SPICE netlists may be generated and used for independent or coupled thermal analysis.},
   Keywords = {SPICE
automation
convection
cooling
electronic design automation
elemental semiconductors
enthalpy
glass
integrated circuit design
integrated circuit packaging
permittivity
silicon-on-insulator
stacking
thermal analysis
thermal conductivity
thermal resistance
3D model building
3D stacked IC
3D thermal resistance model
CFD Research Corporation
EDA system
Python script based network
Python-script based framework
RF integration
SPICE
Si
automated thermal simulation
electrical properties
electronic design automation system
full chip thermal distribution
heat dissipation
heating
hot spots
logic integration
memory integration
mixed signal integration
multilayers
optoelectronics
signal delay
thermal analysis
thermal properties
thermal resistance
thermal signal delay
thermal signal distortion
thermal simulations
three-dimensional stacked integrated circuits
vertically stacked multiple layers},
   Year = {2004}
}



@article{3D:WCD+99,
   Author = {Wuytack, S. and Catthoor, F. and De Jong, G. and De Man, H. J.},
   Title = {Minimizing the required memory bandwidth in {VLSI} system realizations},
   Journal = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
   Volume = {7},
   Number = {4},
   Pages = {433--441},
   ISSN = {1063-8210},
   Abstract = {In this paper, we present the problem of storage bandwidth optimization (SBO) in VLSI system realizations. Our goal is to minimize the required memory bandwidth within the given cycle budget by adding ordering constraints to the flow graph. This allows the subsequent memory allocation and assignment tasks to come up with a cheaper memory architecture with less memories and memory ports. The importance and the effect of SBO is shown on realistic examples both in the video and asynchronous transfer-mode (ATM) domains. We show that it is important to take into account which data is being accessed in parallel, instead of only considering the number of simultaneous memory accesses. Our problem formulation leads to the optimization of a conflict (hyper) graph. For the target domain of ATM, only flat graphs without loops have to be treated. For this subproblem, a prototype tool has been implemented to demonstrate the feasibility of automating this important system design step},
   Keywords = {VLSI
asynchronous transfer mode
data flow graphs
memory architecture
processor scheduling
resource allocation
storage management
VLSI system realizations
assignment tasks
asynchronous transfer-mode
conflict graph
cycle budget
flat graphs without loops
flow graph
high-level memory management
hyper graph
low-power design
memory allocation
memory architecture
ordering constraints
partial ordering
prototype tool
required memory bandwidth minimization
storage bandwidth optimization
video processing},
   Year = {1999}
}



@inproceedings{3D:YYV+05,
   Author = {Tsai, Y. and Xie, Y. and Narayanan, V. and Irwin, M. J.},
   Title = {Three-dimensional cache design exploration using {3DCacti}},
   BookTitle = {International Conference on Computer Design},
   Pages = {519--524},
   Abstract = {As technology scales, interconnects dominate the performance and power behavior of deep submicron designs. Three-dimensional integrated circuits (3D ICs) have been proposed as a way to mitigate the interconnect challenges. In this paper, we explore the architectural design of cache memories using 3D circuits. We present a delay and energy model, 3DCacti, to explore different 3D design options of partitioning a cache. The tool allows partitioning of the cache across different device layers at various levels of granularity. The tool has been validated by comparing its results with those obtained from circuit simulation of custom 3D layouts. We also explore the effects of various cache partitioning parameters and 3D technology parameters on delay and energy to demonstrate the utility of the tool.},
   Keywords = {cache storage
integrated circuit interconnections
integrated circuit layout
logic CAD
memory architecture
3D integrated circuits
3D layouts
3DCacti
cache memories
cache partitioning
circuit simulation
deep submicron designs
delay model
energy model
interconnects},
   Year = {2005}
}



% Note: the YYG initials in this key come from the authors' given names as the
% entry was originally recorded; the key is kept unchanged so existing
% citations still resolve.
@inproceedings{3D:YYG+00,
   Author = {Chang, Yun-Chih and Chang, Yao-Wen and Wu, Guang-Ming and Wu, Shu-Wei},
   Title = {{B*-trees}: a new representation for non-slicing floorplans},
   BookTitle = {DAC},
   Pages = {458--463},
   Year = {2000}
}



@article{3d-ZLR+05,
   Author = {Zeng, A. and Lu, J. and Rose, K. and Gutmann, R. J.},
   Title = {First-order Performance Prediction of Cache Memory with Wafer-level {3D}-Integration},
   Journal = {IEEE Design and Test of Computers},
   Volume = {22},
   Number = {6},
   Pages = {548--555},
   Abstract = {The advantages of 3D design can be exploited by reducing the memory access time. In this article, the authors use a simulator based on analytical models to build an optimal processor-memory configuration for two designs: a graphics processor and a microprocessor. One emerging alternative approach to relieving these interconnect constraints is the use of wafer-level 3D integration, which provides a high density of high-performance, low-parasitic vertical interconnects. A wafer-level 3D design is partitionable into multiple chips connected by short vertical vias. This arrangement reduces the length of many global interconnects without introducing any logic complexity. Wafer-level 3D integration also reduces the required number of repeaters, thereby improving the area efficiency and reducing the power consumed within the interconnect network. With micron-size interwafer vias, wafer-level 3D integration allows a large memory bandwidth with little wafer area consumption. We have developed a software program that allows a first-order comparison of cache designs in 2D and 3D IC technologies. We present a first-order estimate of the performance improvements achieved by 3D implementation of cache memory, with emphasis on large caches in deep-submicron technologies.},
   Year = {2005}
}



@inproceedings{3D:ZRG04,
   Author = {Zeng, A. Y. and Rose, K. and Gutmann, R. J.},
   Title = {Cache array architecture optimization at deep submicron technologies},
   BookTitle = {Proceedings of the IEEE International Conference on Computer Design: VLSI in Computers and Processors (ICCD 2004)},
   Pages = {320--325},
   Abstract = {A cache access time model, PRACTICS (predictor of access and cycle time for cache stack), has been developed to optimize the memory array architecture for the minimum access and cycle times of on-chip memory using circuit models based on Cadence simulations. Lumped RC models have been used to approximate the distributed RC interconnect network in the access time models. Both SRAM and DRAM models have been validated with industrial designs. The limited influences of gate fan-out and transistor size on the cache array architecture indicate that interconnect delay is dominant at deep submicron technologies.},
   Keywords = {DRAM chips
RC circuits
SRAM chips
cache storage
circuit optimisation
circuit simulation
integrated circuit design
integrated circuit interconnections
integrated circuit modelling
lumped parameter networks
memory architecture
Cadence simulation
DRAM models
SRAM models
cache access time model
cache array architecture optimization
cache stack
circuit models
deep submicron technology
distributed RC interconnect network
interconnect delay
lumped RC models
memory array architecture
on-chip memory},
   Year = {2004}
}



% This entry previously had no citation key, which makes BibTeX reject it.
% The key below follows the file's author-initials + year scheme.
% TODO confirm it does not collide with a key elsewhere in the file.
@inproceedings{3D:ZS06,
   Author = {Zhang, T. and Sapatnekar, S. S.},
   Title = {Temperature-Aware Routing in {3D} {ICs}},
   BookTitle = {Asia-South Pacific Design Automation Conference},
   Pages = {309--314},
   Year = {2006}
}
